/*****************************************************************************
 * Copyright (C) Codehaus.org                                                *
 * ------------------------------------------------------------------------- *
 * Licensed under the Apache License, Version 2.0 (the "License");           *
 * you may not use this file except in compliance with the License.          *
 * You may obtain a copy of the License at                                   *
 *                                                                           *
 * http://www.apache.org/licenses/LICENSE-2.0                                *
 *                                                                           *
 * Unless required by applicable law or agreed to in writing, software       *
 * distributed under the License is distributed on an "AS IS" BASIS,         *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
 * See the License for the specific language governing permissions and       *
 * limitations under the License.                                            *
 *****************************************************************************/
package net.ion.rosetta;

import java.util.List;
import java.util.Stack;

import net.ion.rosetta.annotations.Private;
import net.ion.rosetta.functors.Unary;
import net.ion.rosetta.pattern.CharPredicate;
import net.ion.rosetta.pattern.Pattern;
import net.ion.rosetta.pattern.Patterns;
import net.ion.rosetta.util.Lists;
import net.ion.rosetta.util.Objects;

/**
 * Processes indentation based lexical structure according to the
 * <a href="http://en.wikipedia.org/wiki/Off-side_rule">Off-side rule</a>.
 *
 * @author Ben Yu
 */
public final class Indentation {

  /**
   * A {@link CharPredicate} that returns true only if the character isn't line feed and
   * {@link Character#isWhitespace(char)} returns true.
   */
  static final CharPredicate INLINE_WHITESPACE = new CharPredicate() {
    public boolean isChar(char c) {
      return c != '\n' && Character.isWhitespace(c);
    }

    @Override
    public String toString() {
      return "whitespace";
    }
  };

  /**
   * A {@link Pattern} object that matches a line continuation, i.e. a backslash character
   * ({@code '\'}) followed by some inline whitespaces and ended by a line feed character
   * ({@code '\n'}).
   *
   * <p>Is useful if the line feed character plays a role in the syntax (as in
   * indentation-sensitive languages) and line continuation is supported.
   */
  static final Pattern LINE_CONTINUATION = Patterns.sequence(
      Patterns.isChar('\\'), Patterns.many(INLINE_WHITESPACE), Patterns.isChar('\n'));

  /**
   * A {@link Pattern} object that matches one or more {@link #INLINE_WHITESPACE} characters,
   * i.e. whitespace other than the line feed character ({@code '\n'}).
   *
   * <p>Line continuations are not matched by this pattern on its own; they are combined with
   * it in {@link #WHITESPACES}.
   */
  static final Pattern INLINE_WHITESPACES = Patterns.many1(INLINE_WHITESPACE);

  /**
   * A {@link Parser} that recognizes 1 or more whitespace characters on the same line. Line
   * continuation (escaped by a backslash character {@code '\'}) is considered the same line.
   */
  public static final Parser<Void> WHITESPACES =
      Scanners.pattern(INLINE_WHITESPACES.or(LINE_CONTINUATION).many1(), "whitespaces");

  /** Default token values used for indent, outdent and line feed. */
  @Private
  static enum Punctuation {
    INDENT, OUTDENT, LF
  }

  /** Token value emitted when a line is indented deeper than the enclosing one. */
  private final Object indent;

  /** Token value emitted when a line returns to a shallower indentation. */
  private final Object outdent;

  /**
   * Creates an {@link Indentation} object that uses {@code indent} and {@code outdent} as the
   * token values for indentation and outdentation.
   *
   * @param indent the token value generated for an indentation
   * @param outdent the token value generated for an outdentation
   */
  public Indentation(Object indent, Object outdent) {
    this.indent = indent;
    this.outdent = outdent;
  }

  /**
   * Creates a {@link Indentation} object that generates default indent and outdent tokens
   * ({@link Punctuation#INDENT} and {@link Punctuation#OUTDENT}).
   */
  public Indentation() {
    this(Punctuation.INDENT, Punctuation.OUTDENT);
  }

  /** A {@link Parser} that recognizes the generated {@code indent} token. */
  public Parser<Token> indent() {
    return token(indent);
  }

  /** A {@link Parser} that recognizes the generated {@code outdent} token. */
  public Parser<Token> outdent() {
    return token(outdent);
  }

  /**
   * A {@link Parser} that greedily runs {@code tokenizer}, and translates line feed characters
   * ({@code '\n'}) to {@code indent} and {@code outdent} tokens. Return values are wrapped in
   * {@link Token} objects and collected in a {@link List}. Patterns recognized by {@code delim}
   * are ignored.
   *
   * @param tokenizer the parser that recognizes the regular tokens
   * @param delim the parser that recognizes the delimiters to skip between tokens
   * @return a parser producing the token list with line feeds replaced by indent/outdent tokens
   */
  public Parser<List<Token>> lexer(Parser<?> tokenizer, Parser<?> delim) {
    // Line feeds are tokenized as Punctuation.LF so that analyzeIndentations() can see them.
    Parser<?> lf = Scanners.isChar('\n').retn(Punctuation.LF);
    return Parsers.plus(tokenizer, lf).lexer(delim).map(new Unary<List<Token>>() {
      public List<Token> map(List<Token> tokens) {
        return analyzeIndentations(tokens, Punctuation.LF);
      }

      @Override
      public String toString() {
        return "lexer";
      }
    });
  }

  /** Returns a {@link Parser} that recognizes a token whose value is the same as {@code value}. */
  private static Parser<Token> token(Object value) {
    return Parsers.token(InternalFunctors.tokenWithSameValue(value));
  }

  /**
   * Analyzes indentation by looking at the first token after each {@code lf} and inserting
   * {@code indent} and {@code outdent} tokens properly.
   *
   * <p>The indentation of a line is the index of its first token minus the index right after
   * the preceding {@code lf} token (0 for the first line). Tokens whose value equals
   * {@code lf} are not copied to the result. After the last token, one {@code outdent} token
   * is appended for every indentation level still open except the first one.
   *
   * @param tokens the tokens to analyze, including the {@code lf} tokens
   * @param lf the token value that represents a line feed
   * @return {@code tokens} itself if it is empty; otherwise a new list without {@code lf}
   *         tokens and with {@code indent}/{@code outdent} tokens inserted
   */
  List<Token> analyzeIndentations(List<Token> tokens, Object lf) {
    if (tokens.isEmpty()) {
      return tokens;
    }
    int size = tokens.size();
    // Reserve some extra room for the generated indent/outdent tokens.
    List<Token> result = Lists.arrayList(size + size / 16);
    Stack<Integer> indentations = new Stack<Integer>();
    boolean freshLine = true;
    // Index of the first character of the current line.
    int lfIndex = 0;
    for (Token token : tokens) {
      if (freshLine) {
        int indentation = token.index() - lfIndex;
        if (Objects.equals(token.value(), lf)) {
          // if first token on a line is lf, indentation is ignored.
          indentation = 0;
        }
        newLine(token, indentations, indentation, result);
      }
      if (Objects.equals(token.value(), lf)) {
        freshLine = true;
        lfIndex = token.index() + token.length();
      } else {
        freshLine = false;
        result.add(token);
      }
    }
    Token lastToken = tokens.get(tokens.size() - 1);
    int endIndex = lastToken.index() + lastToken.length();
    Token outdentToken = pseudoToken(endIndex, outdent);
    for (int i = 0; i < indentations.size() - 1; i++) {
      // add outdent for every remaining indentation except the first one
      result.add(outdentToken);
    }
    return result;
  }

  /**
   * Handles the first token of a line: compares {@code indentation} against the stack of open
   * indentation levels and appends the resulting {@code indent}/{@code outdent} pseudo-tokens
   * to {@code result}.
   *
   * <ul>
   * <li>If the stack is empty, {@code indentation} is recorded and nothing is emitted.
   * <li>If {@code indentation} is deeper than the top level, it is pushed and one
   *     {@code indent} token is emitted.
   * <li>If it is shallower, levels are popped, emitting one {@code outdent} token per popped
   *     level, until the top level is not deeper than {@code indentation}. When a pop empties
   *     the stack, no token is emitted for that level and the method returns without recording
   *     {@code indentation}.
   * <li>If it equals the top level, nothing happens.
   * </ul>
   *
   * @param token the first token of the line; its index is used for the pseudo-tokens
   * @param indentations the stack of currently open indentation levels, modified in place
   * @param indentation the indentation of the line
   * @param result the list that receives the generated tokens
   */
  private void newLine(
      Token token, Stack<Integer> indentations, int indentation, List<Token> result) {
    for (;;) {
      if (indentations.isEmpty()) {
        indentations.push(indentation);
        return;
      }
      int previousIndentation = indentations.peek();
      if (previousIndentation < indentation) {
        // indent
        indentations.push(indentation);
        result.add(pseudoToken(token.index(), indent));
        return;
      } else if (previousIndentation > indentation) {
        // outdent
        indentations.pop();
        if (indentations.isEmpty()) {
          return;
        }
        result.add(pseudoToken(token.index(), outdent));
        continue;
      }
      return;
    }
  }

  /** Creates a zero-length {@link Token} at {@code index} carrying {@code value}. */
  private static Token pseudoToken(int index, Object value) {
    return new Token(index, 0, value);
  }
}