/**
 * Copyright 2002-2017 Evgeny Gryaznov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.textmapper.tool.compiler;

import org.textmapper.lapg.LapgCore;
import org.textmapper.lapg.api.*;
import org.textmapper.lapg.api.ast.AstRawType;
import org.textmapper.lapg.api.ast.AstType;
import org.textmapper.lapg.api.builder.GrammarBuilder;
import org.textmapper.lapg.api.regex.RegexContext;
import org.textmapper.lapg.api.regex.RegexMatcher;
import org.textmapper.lapg.api.regex.RegexParseException;
import org.textmapper.lapg.api.regex.RegexPart;
import org.textmapper.tool.parser.TMTree;
import org.textmapper.tool.parser.ast.*;

import java.util.*;

/**
 * evgeny, 1/21/13
 *
 * Turns the lexer section of a parsed grammar into lexer rules registered on the
 * {@link GrammarBuilder} obtained from the resolver. Problems found on the way are
 * reported through {@link TMResolver#error} and the offending rule is skipped.
 */
public class TMLexerCompiler {

    private final TMTree<TmaInput> tree;
    private final TMResolver resolver;
    private final GrammarBuilder builder;

    /** States each lexeme is applicable in; filled by {@link #collectAttributes}. */
    private final Map<TmaLexeme, RuleAttributes> attributes = new HashMap<>();

    /**
     * @param resolver source of the syntax tree, the grammar builder, symbols and lexer states
     */
    public TMLexerCompiler(TMResolver resolver) {
        this.resolver = resolver;
        this.tree = resolver.getTree();
        this.builder = resolver.getBuilder();
    }

    /** Reports {@code message} against node {@code n} via the resolver. */
    private void error(ITmaNode n, String message) {
        resolver.error(n, message);
    }

    /**
     * Resolves the state references of a start-conditions clause.
     *
     * @param conditions the clause to resolve
     * @return the resolved states; {@code resolver.allStates()} when the clause carries no
     *         reference list, and all states as well when no reference could be resolved
     *         (every unresolved name is reported as an error)
     */
    private List<LexerState> resolveStates(TmaStartConditions conditions) {
        List<TmaStateref> refs = conditions.getStaterefListCommaSeparated();
        if (refs == null) {
            return resolver.allStates();
        }

        List<LexerState> result = new ArrayList<>();
        for (TmaStateref ref : refs) {
            LexerState applicable = resolver.getState(ref.getName());
            if (applicable == null) {
                error(ref, ref.getName() + " cannot be resolved");
                continue;
            }
            result.add(applicable);
        }
        if (result.isEmpty()) {
            result.addAll(resolver.allStates());
        }
        return result;
    }

    /**
     * Finds the class rule whose matcher accepts the constant text of {@code regex}.
     * Only class rules applicable in the same states as {@code l} are considered.
     *
     * @param classMatchers class rules with their matchers, scanned in map iteration order
     * @param l             the lexeme being compiled
     * @param regex         the parsed pattern of {@code l}
     * @return the first matching class rule, or {@code null} when {@code regex} is not
     *         constant, {@code l} is itself a class lexeme, or nothing matches; every
     *         additional match is reported as an error and ignored
     */
    private LexerRule getClassRule(Map<LexerRule, RegexMatcher> classMatchers,
                                   TmaLexeme l, RegexPart regex) {
        TmaLexemeAttrs attrs = l.getAttrs();
        boolean isClass = attrs != null && attrs.getKind() == TmaLexemeAttribute.LCLASS;
        if (!regex.isConstant() || isClass) {
            return null;
        }

        LexerRule result = null;
        for (Map.Entry<LexerRule, RegexMatcher> entry : classMatchers.entrySet()) {
            LexerRule rule = entry.getKey();
            TmaLexeme astClassLexeme = (TmaLexeme) ((DerivedSourceElement) rule).getOrigin();
            if (!attributes.get(astClassLexeme).canBeClassFor(attributes.get(l))) {
                continue;
            }
            if (!entry.getValue().matches(regex.getConstantValue())) {
                continue;
            }
            if (result != null) {
                error(l, "regex matches two classes `" + result.getSymbol().getNameText()
                        + "' and `" + rule.getSymbol().getNameText() + "', using first");
            } else {
                result = rule;
            }
        }
        return result;
    }

    /**
     * Maps lexeme attributes onto a {@code LexerRule.KIND_*} constant.
     *
     * @param attr lexeme attributes, may be {@code null}
     * @return the matching kind; {@link LexerRule#KIND_NONE} when {@code attr} is
     *         {@code null} or its kind is none of class, layout, soft or space
     */
    public int getLexerRuleKind(TmaLexemeAttrs attr) {
        if (attr == null) {
            return LexerRule.KIND_NONE;
        }
        switch (attr.getKind()) {
            case LCLASS:
                return LexerRule.KIND_CLASS;
            case LLAYOUT:
                return LexerRule.KIND_LAYOUT;
            case LSOFT:
                return LexerRule.KIND_SOFT;
            case LSPACE:
                return LexerRule.KIND_SPACE;
        }
        return LexerRule.KIND_NONE;
    }

    /**
     * Records the applicable states of every lexeme reachable from {@code part}.
     * A lexeme uses its own start conditions when it has them and {@code states}
     * otherwise; a start-conditions scope resolves its own conditions and passes
     * them down to the parts it contains. Other kinds of parts are ignored.
     *
     * @param states states in effect for {@code part}
     * @param part   lexer part to inspect
     */
    private void collectAttributes(List<LexerState> states, ITmaLexerPart part) {
        if (part instanceof TmaLexeme) {
            TmaLexeme lexeme = (TmaLexeme) part;
            TmaStartConditions conditions = lexeme.getStartConditions();
            if (conditions != null) {
                states = resolveStates(conditions);
            }
            attributes.put(lexeme, new RuleAttributes(states));
        } else if (part instanceof TmaStartConditionsScope) {
            TmaStartConditionsScope scope = (TmaStartConditionsScope) part;
            states = resolveStates(scope.getStartConditions());
            for (ITmaLexerPart p : scope.getLexerParts()) {
                collectAttributes(states, p);
            }
        }
    }

    /**
     * Compiles all lexemes of the grammar in three passes: collect the applicable
     * states of every lexeme, register class rules together with their matchers,
     * then register the remaining rules, binding soft lexemes to the class rule
     * that matches their constant pattern.
     */
    public void compile() {
        Map<Terminal, Terminal> softToClass = new HashMap<>();
        Set<Terminal> nonSoft = new HashSet<>();

        // Step 1. Collect states.

        List<LexerState> defaultStates = resolver.inclusiveStates();
        for (ITmaLexerPart clause : tree.getRoot().getLexer()) {
            collectAttributes(defaultStates, clause);
        }

        // Step 2. Process class lexical rules.

        RegexContext context = resolver.createRegexContext();
        // Insertion-ordered, so that "using first" in getClassRule means first declared.
        Map<LexerRule, RegexMatcher> classMatchers = new LinkedHashMap<>();

        for (TmaLexeme lexeme : resolver.getLexerParts(TmaLexeme.class)) {
            TmaLexemeAttrs attrs = lexeme.getAttrs();
            if (attrs == null || attrs.getKind() != TmaLexemeAttribute.LCLASS) {
                continue;
            }
            if (lexeme.getPattern() == null) {
                error(lexeme, "class lexeme rule without regular expression, ignored");
                continue;
            }

            Symbol s = resolver.getSymbol(lexeme.getName().getID());
            if (!(s instanceof Terminal)) {
                // not a terminal? already reported, ignore
                continue;
            }
            Terminal classTerm = (Terminal) s;
            nonSoft.add(classTerm);

            RegexPart regex;
            RegexMatcher matcher;
            try {
                regex = LapgCore.parse(s.getNameText(), lexeme.getPattern().getRegexp());
                matcher = LapgCore.createMatcher(regex, context);
            } catch (RegexParseException e) {
                error(lexeme.getPattern(), e.getMessage());
                continue;
            }

            int priority = lexeme.getPriority() == null ? 0 : lexeme.getPriority();
            List<LexerState> states = attributes.get(lexeme).getApplicableInStates();
            if (states.isEmpty()) {
                error(lexeme, "lexer rule is never applicable, ignored");
                continue;
            }

            LexerRule liLexerRule = builder.addLexerRule(LexerRule.KIND_CLASS, classTerm, regex,
                    states, priority, null, lexeme);
            classMatchers.put(liLexerRule, matcher);
            TMDataUtil.putCode(liLexerRule, lexeme.getCommand());
        }

        // Step 3. Process other lexical rules. Match soft lexemes with their classes.

        for (TmaLexeme lexeme : resolver.getLexerParts(TmaLexeme.class)) {
            TmaLexemeAttrs attrs = lexeme.getAttrs();
            int kind = getLexerRuleKind(attrs);
            if (kind == LexerRule.KIND_CLASS) {
                // handled in step 2
                continue;
            }

            Symbol s = resolver.getSymbol(lexeme.getName().getID());
            if (!(s instanceof Terminal)) {
                // not a terminal? already reported, ignore
                continue;
            }
            Terminal term = (Terminal) s;

            // A terminal is either soft or non-soft for all of its rules.
            boolean isSoft = (kind == LexerRule.KIND_SOFT);
            if (isSoft && nonSoft.contains(term)) {
                error(lexeme, "redeclaration of non-soft terminal: " + lexeme.getName());
                continue;
            } else if (!isSoft) {
                if (softToClass.containsKey(term)) {
                    error(lexeme, "redeclaration of soft terminal: " + lexeme.getName());
                    continue;
                }
                nonSoft.add(term);
            }

            if (lexeme.getPattern() == null) {
                if (isSoft) {
                    error(lexeme, "soft lexeme rule `" + lexeme.getName().getID()
                            + "' should have a regular expression");
                }
                continue;
            }

            String name = lexeme.getName().getID();
            RegexPart regex;
            try {
                regex = LapgCore.parse(name, lexeme.getPattern().getRegexp());
            } catch (RegexParseException e) {
                error(lexeme.getPattern(), e.getMessage());
                continue;
            }

            if (isSoft && lexeme.getCommand() != null) {
                // TODO Note: soft lexeme is able to override the code
                error(lexeme.getCommand(), "soft lexeme rule `" + lexeme.getName().getID()
                        + "' cannot have a semantic action");
            }

            LexerRule classRule = getClassRule(classMatchers, lexeme, regex);
            if (isSoft) {
                if (classRule == null) {
                    error(lexeme, "soft lexeme rule `" + name + "' "
                            + (regex.isConstant()
                            ? "doesn't match any class rule"
                            : "should have a constant regexp"));
                    continue;
                }

                Terminal softClass = classRule.getSymbol();

                // An explicit type on the soft lexeme must repeat the type of its class.
                String type = getRawTypeText(lexeme.getRawType());
                String classtype = getSymbolType(softClass);
                if (type != null && !type.equals(classtype)) {
                    error(lexeme, "soft terminal `" + name + "' overrides base type: expected `"
                            + (classtype == null ? "<no type>" : classtype)
                            + "', found `" + type + "'");
                    continue;
                }

                final Terminal oldClass = softToClass.get(term);
                if (oldClass != null && oldClass != softClass) {
                    error(lexeme, "redeclaration of soft class for `" + term.getNameText()
                            + "': found " + softClass.getNameText()
                            + " instead of " + oldClass.getNameText());
                    continue;
                } else if (oldClass == null) {
                    builder.makeSoft(term, softClass);
                    softToClass.put(term, softClass);
                }
                // TODO check applicable states
            }

            int priority = lexeme.getPriority() == null ? 0 : lexeme.getPriority();
            List<LexerState> states = attributes.get(lexeme).getApplicableInStates();
            if (states.isEmpty()) {
                error(lexeme, "lexer rule is never applicable, ignored");
                continue;
            }

            LexerRule liLexerRule = builder.addLexerRule(kind, term, regex, states, priority,
                    classRule, lexeme);
            TMDataUtil.putCode(liLexerRule, lexeme.getCommand());
        }
    }

    /**
     * @return the raw type text of {@code s}, or {@code null} when its type is not an
     *         {@link AstRawType}
     */
    private static String getSymbolType(Symbol s) {
        final AstType type = s.getType();
        return type instanceof AstRawType ? ((AstRawType) type).getRawType() : null;
    }

    /**
     * @return the text of {@code type} without its first and last character (presumably
     *         the delimiters around the type - confirm against the grammar), or
     *         {@code null} when {@code type} is {@code null}
     */
    private static String getRawTypeText(TmaRawType type) {
        if (type == null) {
            return null;
        }
        String text = type.getText();
        return text.substring(1, text.length() - 1);
    }

    /** Per-lexeme data gathered before any rule is created. */
    private static class RuleAttributes {

        private final List<LexerState> applicableInStates;

        public RuleAttributes(List<LexerState> applicableInStates) {
            this.applicableInStates = applicableInStates;
        }

        public List<LexerState> getApplicableInStates() {
            return applicableInStates;
        }

        /**
         * Tells whether a class rule with these attributes may serve as the class of a
         * lexeme with attributes {@code l}, i.e. whether both apply in the same states.
         *
         * @param l attributes of the candidate lexeme
         * @return {@code true} when both state lists denote the same set of states
         */
        public boolean canBeClassFor(RuleAttributes l) {
            // Compare as sets: a state listed twice in a start-condition list must neither
            // make different state sets look equal nor equal state sets look different,
            // which a size check followed by containsAll on the raw lists would do.
            Set<LexerState> own = new HashSet<>(applicableInStates);
            Set<LexerState> other = new HashSet<>(l.getApplicableInStates());
            return own.equals(other);
        }
    }
}