// Copyright 2014 The Bazel Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.devtools.build.lib.syntax; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.devtools.build.lib.concurrent.ThreadSafety.Immutable; import com.google.devtools.build.lib.events.Event; import com.google.devtools.build.lib.events.EventHandler; import com.google.devtools.build.lib.events.Location; import com.google.devtools.build.lib.profiler.Profiler; import com.google.devtools.build.lib.profiler.ProfilerTask; import com.google.devtools.build.lib.util.Pair; import com.google.devtools.build.lib.vfs.PathFragment; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Stack; /** * A tokenizer for the BUILD language. * <p> * See: <a href="https://docs.python.org/2/reference/lexical_analysis.html"/> * for some details. * <p> * Since BUILD files are small, we just tokenize the entire file a-priori * instead of interleaving scanning with parsing. 
 */
public final class Lexer {

  // Characters that can come immediately prior to an '=' character to generate
  // a different token
  private static final ImmutableMap<Character, TokenKind> EQUAL_TOKENS =
      ImmutableMap.<Character, TokenKind>builder()
          .put('=', TokenKind.EQUALS_EQUALS)
          .put('!', TokenKind.NOT_EQUALS)
          .put('>', TokenKind.GREATER_EQUALS)
          .put('<', TokenKind.LESS_EQUALS)
          .put('+', TokenKind.PLUS_EQUALS)
          .put('-', TokenKind.MINUS_EQUALS)
          .put('*', TokenKind.STAR_EQUALS)
          .put('/', TokenKind.SLASH_EQUALS)
          .put('%', TokenKind.PERCENT_EQUALS)
          .build();

  // Sink for scanning errors reported via error().
  private final EventHandler eventHandler;

  // Input buffer and position
  private final char[] buffer;
  private int pos;

  /**
   * The part of the location information that is common to all LexerLocation
   * instances created by this Lexer. Factored into a separate object so that
   * many Locations instances can share the same information as compactly as
   * possible, without closing over a Lexer instance.
   */
  private static class LocationInfo {
    final LineNumberTable lineNumberTable;
    final PathFragment filename;

    LocationInfo(PathFragment filename, LineNumberTable lineNumberTable) {
      this.filename = filename;
      this.lineNumberTable = lineNumberTable;
    }
  }

  private final LocationInfo locationInfo;

  // The stack of enclosing indentation levels; always contains '0' at the
  // bottom.
  private final Stack<Integer> indentStack = new Stack<>();

  // Output token stream; filled eagerly by tokenize() from the constructor.
  private final List<Token> tokens;

  // The number of unclosed open-parens ("(", '{', '[') at the current point in
  // the stream. Whitespace is handled differently when this is nonzero.
  private int openParenStackDepth = 0;

  // Set to true by error(); see containsErrors().
  private boolean containsErrors;

  /**
   * Constructs a lexer which tokenizes the contents of the specified InputBuffer. Any errors during
   * lexing are reported on "handler".
   */
  public Lexer(
      ParserInputSource input, EventHandler eventHandler, LineNumberTable lineNumberTable) {
    this.buffer = input.getContent();
    // Empirical measurements show roughly 1 token per 8 characters in buffer.
    this.tokens = Lists.newArrayListWithExpectedSize(buffer.length / 8);
    this.pos = 0;
    this.eventHandler = eventHandler;
    this.locationInfo = new LocationInfo(input.getPath(), lineNumberTable);
    indentStack.push(0);

    long startTime = Profiler.nanoTimeMaybe();
    // The entire input is scanned up front (see the class javadoc).
    tokenize();
    Profiler.instance().logSimpleTask(startTime, ProfilerTask.SKYLARK_LEXER, getFilename());
  }

  /** Convenience constructor that derives the line number table from the input itself. */
  public Lexer(ParserInputSource input, EventHandler eventHandler) {
    this(input, eventHandler, LineNumberTable.create(input.getContent(), input.getPath()));
  }

  /**
   * Returns the filename from which the lexer's input came. Returns an empty value if the input
   * came from a string.
   */
  public PathFragment getFilename() {
    return locationInfo.filename != null ? locationInfo.filename : PathFragment.EMPTY_FRAGMENT;
  }

  /**
   * Returns true if there were errors during scanning of this input file or
   * string. The Lexer may attempt to recover from errors, but clients should
   * not rely on the results of scanning if this flag is set.
   */
  public boolean containsErrors() {
    return containsErrors;
  }

  /**
   * Returns the (mutable) list of tokens generated by the Lexer.
   */
  public List<Token> getTokens() {
    return tokens;
  }

  // Decrements the open-paren depth when a closing ')', ']' or '}' is seen.
  // NOTE(review): an unmatched closing bracket is reported as "indentation
  // error", which reads as misleading — confirm whether the message is
  // intentional before changing it (callers/tests may match on it).
  private void popParen() {
    if (openParenStackDepth == 0) {
      error("indentation error");
    } else {
      openParenStackDepth--;
    }
  }

  // Reports an error at the position of the character just consumed.
  private void error(String message) {
    error(message, pos - 1, pos - 1);
  }

  // Reports an error over [start, end] and marks the whole scan as failed.
  private void error(String message, int start, int end) {
    this.containsErrors = true;
    eventHandler.handle(Event.error(createLocation(start, end), message));
  }

  Location createLocation(int start, int end) {
    return new LexerLocation(locationInfo, start, end);
  }

  // Don't use an inner class as we don't want to close over the Lexer, only
  // the LocationInfo.
@Immutable
private static final class LexerLocation extends Location {

  private final LineNumberTable lineNumberTable;

  LexerLocation(LocationInfo locationInfo, int start, int end) {
    super(start, end);
    this.lineNumberTable = locationInfo.lineNumberTable;
  }

  /** Resolves the path of this location through the line number table. */
  @Override
  public PathFragment getPath() {
    return lineNumberTable.getPath(getStartOffset());
  }

  @Override
  public LineAndColumn getStartLineAndColumn() {
    return lineNumberTable.getLineAndColumn(getStartOffset());
  }

  @Override
  public LineAndColumn getEndLineAndColumn() {
    return lineNumberTable.getLineAndColumn(getEndOffset());
  }

  @Override
  public int hashCode() {
    return Objects.hash(lineNumberTable, internalHashCode());
  }

  @Override
  public boolean equals(Object other) {
    if (other == null || !getClass().equals(other.getClass())) {
      return false;
    }
    LexerLocation otherLocation = (LexerLocation) other;
    return internalEquals(otherLocation)
        && Objects.equals(lineNumberTable, otherLocation.lineNumberTable);
  }
}

/** invariant: symbol positions are half-open intervals. */
private void addToken(Token s) {
  tokens.add(s);
}

/**
 * Parses an end-of-line sequence, handling statement indentation correctly.
 *
 * <p>UNIX newlines are assumed (LF). Carriage returns are always ignored.
 *
 * <p>ON ENTRY: 'pos' is the index of the char after '\n'.
 * ON EXIT: 'pos' is the index of the next non-space char after '\n'.
 */
private void newline() {
  if (openParenStackDepth > 0) {
    newlineInsideExpression(); // in an expression: ignore space
  } else {
    newlineOutsideExpression(); // generate NEWLINE/INDENT/OUTDENT tokens
  }
}

// Inside an unclosed paren/bracket/brace a newline is not significant:
// just skip any run of spaces, tabs and carriage returns.
private void newlineInsideExpression() {
  while (pos < buffer.length) {
    switch (buffer[pos]) {
      case ' ': case '\t': case '\r':
        pos++;
        break;
      default:
        return;
    }
  }
}

// At statement level a newline ends the statement: emit NEWLINE, measure the
// indentation of the next non-blank, non-comment line, and emit
// INDENT/OUTDENT tokens as the indentation level changes.
private void newlineOutsideExpression() {
  if (pos > 1) { // skip over newline at start of file
    addToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
  }

  // we're in a stmt: suck up space at beginning of next line
  int indentLen = 0;
  while (pos < buffer.length) {
    char c = buffer[pos];
    if (c == ' ') {
      indentLen++;
      pos++;
    } else if (c == '\r') {
      // carriage returns are ignored entirely
      pos++;
    } else if (c == '\t') {
      // a tab advances to the next multiple-of-8 column
      indentLen += 8 - indentLen % 8;
      pos++;
    } else if (c == '\n') { // entirely blank line: discard
      indentLen = 0;
      pos++;
    } else if (c == '#') { // line containing only indented comment
      int oldPos = pos;
      while (pos < buffer.length && c != '\n') {
        c = buffer[pos++];
      }
      // NOTE(review): the token span ends at pos - 1 so as to exclude the
      // newline, but for a comment that runs to EOF with no newline this
      // also drops the comment's final character — confirm intended.
      addToken(new Token(TokenKind.COMMENT, oldPos, pos - 1, bufferSlice(oldPos, pos - 1)));
      indentLen = 0;
    } else { // printing character
      break;
    }
  }

  if (pos == buffer.length) {
    indentLen = 0;
  } // trailing space on last line

  // Compare the new indentation against the enclosing levels and emit
  // INDENT (deeper) or one OUTDENT per popped level (shallower).
  int peekedIndent = indentStack.peek();
  if (peekedIndent < indentLen) { // push a level
    indentStack.push(indentLen);
    addToken(new Token(TokenKind.INDENT, pos - 1, pos));

  } else if (peekedIndent > indentLen) { // pop one or more levels
    while (peekedIndent > indentLen) {
      indentStack.pop();
      addToken(new Token(TokenKind.OUTDENT, pos - 1, pos));
      peekedIndent = indentStack.peek();
    }

    if (peekedIndent < indentLen) {
      // the new indentation matches no enclosing level
      error("indentation error");
    }
  }
}

/**
 * Returns true if current position is in the middle of a triple quote
 * delimiter (3 x quot), and advances 'pos' by two if so.
 */
private boolean skipTripleQuote(char quot) {
  if (pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot) {
    pos += 2;
    return true;
  } else {
    return false;
  }
}

/**
 * Scans a string literal delimited by 'quot', containing escape sequences.
 *
 * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter
 * ON EXIT: 'pos' is 1 + the index of the last delimiter.
 *
 * @return the string-literal token.
 */
private Token escapedStringLiteral(char quot, boolean isRaw) {
  boolean inTriplequote = skipTripleQuote(quot);
  int oldPos = pos - 1;
  // more expensive second choice that expands escaped into a buffer
  StringBuilder literal = new StringBuilder();
  while (pos < buffer.length) {
    char c = buffer[pos];
    pos++;
    switch (c) {
      case '\n':
        // A bare newline terminates single-quoted strings but is literal
        // content inside triple-quoted ones.
        if (inTriplequote) {
          literal.append(c);
          break;
        } else {
          error("unterminated string literal at eol", oldPos, pos);
          newline();
          return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
        }
      case '\\':
        if (pos == buffer.length) {
          error("unterminated string literal at eof", oldPos, pos);
          return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
        }
        if (isRaw) {
          // Insert \ and the following character.
          // As in Python, it means that a raw string can never end with a single \.
          literal.append('\\');
          // Line endings after the backslash are normalized to a single LF.
          if (pos + 1 < buffer.length && buffer[pos] == '\r' && buffer[pos + 1] == '\n') {
            literal.append("\n");
            pos += 2;
          } else if (buffer[pos] == '\r' || buffer[pos] == '\n') {
            literal.append("\n");
            pos += 1;
          } else {
            literal.append(buffer[pos]);
            pos += 1;
          }
          break;
        }
        c = buffer[pos];
        pos++;
        switch (c) {
          case '\r':
            // Escaped line ending: swallow an optional LF after the CR.
            if (pos < buffer.length && buffer[pos] == '\n') {
              pos += 1;
              break;
            } else {
              break;
            }
          case '\n':
            // ignore end of line character
            break;
          case 'n':
            literal.append('\n');
            break;
          case 'r':
            literal.append('\r');
            break;
          case 't':
            literal.append('\t');
            break;
          case '\\':
            literal.append('\\');
            break;
          case '\'':
            literal.append('\'');
            break;
          case '"':
            literal.append('"');
            break;
          case '0': case '1': case '2': case '3':
          case '4': case '5': case '6': case '7': { // octal escape
            // Up to three octal digits; the value is truncated to one byte.
            int octal = c - '0';
            if (pos < buffer.length) {
              c = buffer[pos];
              if (c >= '0' && c <= '7') {
                pos++;
                octal = (octal << 3) | (c - '0');
                if (pos < buffer.length) {
                  c = buffer[pos];
                  if (c >= '0' && c <= '7') {
                    pos++;
                    octal = (octal << 3) | (c - '0');
                  }
                }
              }
            }
            literal.append((char) (octal & 0xff));
            break;
          }
          case 'a': case 'b': case 'f': case 'N': case 'u': case 'U': case 'v': case 'x':
            // exists in Python but not implemented in Blaze => error
            error("escape sequence not implemented: \\" + c, oldPos, pos);
            break;
          default:
            // unknown char escape => "\literal"
            literal.append('\\');
            literal.append(c);
            break;
        }
        break;
      case '\'':
      case '"':
        if (c != quot || (inTriplequote && !skipTripleQuote(quot))) {
          // Non-matching quote, treat it like a regular char.
          literal.append(c);
        } else {
          // Matching close-delimiter, all done.
          return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
        }
        break;
      default:
        literal.append(c);
        break;
    }
  }
  error("unterminated string literal at eof", oldPos, pos);
  return new Token(TokenKind.STRING, oldPos, pos, literal.toString());
}

/**
 * Scans a string literal delimited by 'quot'.
 *
 * <ul>
 * <li> ON ENTRY: 'pos' is 1 + the index of the first delimiter
 * <li> ON EXIT: 'pos' is 1 + the index of the last delimiter.
 * </ul>
 *
 * @param isRaw if true, do not escape the string.
 * @return the string-literal token.
 */
private Token stringLiteral(char quot, boolean isRaw) {
  int oldPos = pos - 1;

  // Don't even attempt to parse triple-quotes here.
  if (skipTripleQuote(quot)) {
    pos -= 2;
    return escapedStringLiteral(quot, isRaw);
  }

  // first quick optimistic scan for a simple non-escaped string
  while (pos < buffer.length) {
    char c = buffer[pos++];
    switch (c) {
      case '\n':
        error("unterminated string literal at eol", oldPos, pos);
        Token t = new Token(TokenKind.STRING, oldPos, pos, bufferSlice(oldPos + 1, pos - 1));
        newline();
        return t;
      case '\\':
        if (isRaw) {
          if (pos + 1 < buffer.length && buffer[pos] == '\r' && buffer[pos + 1] == '\n') {
            // There was a CRLF after the newline. No shortcut possible, since it needs to be
            // transformed into a single LF.
            pos = oldPos + 1;
            return escapedStringLiteral(quot, true);
          } else {
            // In raw mode a backslash protects the next character verbatim.
            pos++;
            break;
          }
        }
        // oops, hit an escape, need to start over & build a new string buffer
        pos = oldPos + 1;
        return escapedStringLiteral(quot, false);
      case '\'':
      case '"':
        if (c == quot) {
          // close-quote, all done.
          return new Token(TokenKind.STRING, oldPos, pos, bufferSlice(oldPos + 1, pos - 1));
        }
    }
  }

  // If the current position is beyond the end of the file, need to move it backwards
  // Possible if the file ends with `r"\` (unterminated raw string literal with a backslash)
  if (pos > buffer.length) {
    pos = buffer.length;
  }
  error("unterminated string literal at eof", oldPos, pos);
  return new Token(TokenKind.STRING, oldPos, pos, bufferSlice(oldPos + 1, pos));
}

// Python keywords (including several unused by the BUILD language), mapped to
// their token kinds.
private static final Map<String, TokenKind> keywordMap = new HashMap<>();

static {
  keywordMap.put("and", TokenKind.AND);
  keywordMap.put("as", TokenKind.AS);
  keywordMap.put("assert", TokenKind.ASSERT);
  keywordMap.put("break", TokenKind.BREAK);
  keywordMap.put("class", TokenKind.CLASS);
  keywordMap.put("continue", TokenKind.CONTINUE);
  keywordMap.put("def", TokenKind.DEF);
  keywordMap.put("del", TokenKind.DEL);
  keywordMap.put("elif", TokenKind.ELIF);
  keywordMap.put("else", TokenKind.ELSE);
  keywordMap.put("except", TokenKind.EXCEPT);
  keywordMap.put("finally", TokenKind.FINALLY);
  keywordMap.put("for", TokenKind.FOR);
  keywordMap.put("from", TokenKind.FROM);
  keywordMap.put("global", TokenKind.GLOBAL);
  keywordMap.put("if", TokenKind.IF);
  keywordMap.put("import", TokenKind.IMPORT);
  keywordMap.put("in", TokenKind.IN);
  keywordMap.put("is", TokenKind.IS);
  keywordMap.put("lambda", TokenKind.LAMBDA);
  keywordMap.put("nonlocal", TokenKind.NONLOCAL);
  keywordMap.put("not", TokenKind.NOT);
  keywordMap.put("or", TokenKind.OR);
  keywordMap.put("pass", TokenKind.PASS);
  keywordMap.put("raise", TokenKind.RAISE);
  keywordMap.put("return", TokenKind.RETURN);
  keywordMap.put("try", TokenKind.TRY);
  keywordMap.put("while", TokenKind.WHILE);
  keywordMap.put("with", TokenKind.WITH);
  keywordMap.put("yield", TokenKind.YIELD);
}

/**
 * Scans an identifier or keyword.
 *
 * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier.
 * ON EXIT: 'pos' is 1 + the index of the last char in the identifier.
 *
 * @return the identifier or keyword token.
*/ private Token identifierOrKeyword() { int oldPos = pos - 1; String id = scanIdentifier(); TokenKind kind = keywordMap.get(id); return (kind == null) ? new Token(TokenKind.IDENTIFIER, oldPos, pos, id) : new Token(kind, oldPos, pos, null); } private String scanIdentifier() { int oldPos = pos - 1; while (pos < buffer.length) { switch (buffer[pos]) { case '_': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': pos++; break; default: return bufferSlice(oldPos, pos); } } return bufferSlice(oldPos, pos); } private String scanInteger() { int oldPos = pos - 1; while (pos < buffer.length) { char c = buffer[pos]; switch (c) { case 'X': case 'x': case 'a': case 'A': case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'e': case 'E': case 'f': case 'F': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': pos++; break; default: return bufferSlice(oldPos, pos); } } // TODO(bazel-team): (2009) to do roundtripping when we evaluate the integer // constants, we must save the actual text of the tokens, not just their // integer value. return bufferSlice(oldPos, pos); } /** * Scans an integer literal. * * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. * ON EXIT: 'pos' is 1 + the index of the last char in the literal. * * @return the integer token. 
*/ private Token integer() { int oldPos = pos - 1; String literal = scanInteger(); final String substring; final int radix; if (literal.startsWith("0x") || literal.startsWith("0X")) { radix = 16; substring = literal.substring(2); } else if (literal.startsWith("0") && literal.length() > 1) { radix = 8; substring = literal.substring(1); } else { radix = 10; substring = literal; } int value = 0; try { value = Integer.parseInt(substring, radix); } catch (NumberFormatException e) { error("invalid base-" + radix + " integer constant: " + literal); } return new Token(TokenKind.INT, oldPos, pos, value); } /** * Tokenizes a two-char operator. * @return true if it tokenized an operator */ private boolean tokenizeTwoChars() { if (pos + 2 >= buffer.length) { return false; } char c1 = buffer[pos]; char c2 = buffer[pos + 1]; TokenKind tok = null; if (c2 == '=') { tok = EQUAL_TOKENS.get(c1); } else if (c2 == '*' && c1 == '*') { tok = TokenKind.STAR_STAR; } if (tok == null) { return false; } else { addToken(new Token(tok, pos, pos + 2)); return true; } } /** * Performs tokenization of the character buffer of file contents provided to * the constructor. 
*/ private void tokenize() { while (pos < buffer.length) { if (tokenizeTwoChars()) { pos += 2; continue; } char c = buffer[pos]; pos++; switch (c) { case '{': { addToken(new Token(TokenKind.LBRACE, pos - 1, pos)); openParenStackDepth++; break; } case '}': { addToken(new Token(TokenKind.RBRACE, pos - 1, pos)); popParen(); break; } case '(': { addToken(new Token(TokenKind.LPAREN, pos - 1, pos)); openParenStackDepth++; break; } case ')': { addToken(new Token(TokenKind.RPAREN, pos - 1, pos)); popParen(); break; } case '[': { addToken(new Token(TokenKind.LBRACKET, pos - 1, pos)); openParenStackDepth++; break; } case ']': { addToken(new Token(TokenKind.RBRACKET, pos - 1, pos)); popParen(); break; } case '>': { addToken(new Token(TokenKind.GREATER, pos - 1, pos)); break; } case '<': { addToken(new Token(TokenKind.LESS, pos - 1, pos)); break; } case ':': { addToken(new Token(TokenKind.COLON, pos - 1, pos)); break; } case ',': { addToken(new Token(TokenKind.COMMA, pos - 1, pos)); break; } case '+': { addToken(new Token(TokenKind.PLUS, pos - 1, pos)); break; } case '-': { addToken(new Token(TokenKind.MINUS, pos - 1, pos)); break; } case '|': { addToken(new Token(TokenKind.PIPE, pos - 1, pos)); break; } case '=': { addToken(new Token(TokenKind.EQUALS, pos - 1, pos)); break; } case '%': { addToken(new Token(TokenKind.PERCENT, pos - 1, pos)); break; } case '/': { addToken(new Token(TokenKind.SLASH, pos - 1, pos)); break; } case ';': { addToken(new Token(TokenKind.SEMI, pos - 1, pos)); break; } case '.': { addToken(new Token(TokenKind.DOT, pos - 1, pos)); break; } case '*': { addToken(new Token(TokenKind.STAR, pos - 1, pos)); break; } case ' ': case '\t': case '\r': { /* ignore */ break; } case '\\': { // Backslash character is valid only at the end of a line (or in a string) if (pos + 1 < buffer.length && buffer[pos] == '\n') { pos += 1; // skip the end of line character } else if (pos + 2 < buffer.length && buffer[pos] == '\r' && buffer[pos + 1] == '\n') { pos += 2; // skip 
the CRLF at the end of line } else { addToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c))); } break; } case '\n': { newline(); break; } case '#': { int oldPos = pos - 1; while (pos < buffer.length) { c = buffer[pos]; if (c == '\n') { break; } else { pos++; } } addToken(new Token(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos))); break; } case '\'': case '\"': { addToken(stringLiteral(c, false)); break; } default: { // detect raw strings, e.g. r"str" if (c == 'r' && pos < buffer.length && (buffer[pos] == '\'' || buffer[pos] == '\"')) { c = buffer[pos]; pos++; addToken(stringLiteral(c, true)); break; } if (Character.isDigit(c)) { addToken(integer()); } else if (Character.isJavaIdentifierStart(c) && c != '$') { addToken(identifierOrKeyword()); } else { error("invalid character: '" + c + "'"); } break; } // default } // switch } // while if (indentStack.size() > 1) { // top of stack is always zero addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); while (indentStack.size() > 1) { indentStack.pop(); addToken(new Token(TokenKind.OUTDENT, pos - 1, pos)); } } // Like Python, always end with a NEWLINE token, even if no '\n' in input: if (tokens.isEmpty() || Iterables.getLast(tokens).kind != TokenKind.NEWLINE) { addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); } addToken(new Token(TokenKind.EOF, pos, pos)); } /** * Returns the string at the current line, minus the new line. 
* * @param line the line from which to retrieve the String, 1-based * @return the text of the line */ public String stringAtLine(int line) { Pair<Integer, Integer> offsets = locationInfo.lineNumberTable.getOffsetsForLine(line); return bufferSlice(offsets.first, offsets.second); } /** * Returns parts of the source buffer based on offsets * * @param start the beginning offset for the slice * @param end the offset immediately following the slice * @return the text at offset start with length end - start */ private String bufferSlice(int start, int end) { return new String(this.buffer, start, end - start); } }