/*
 * Copyright 2015-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

// Copyright 2014 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.facebook.buck.query;

import com.google.common.base.Preconditions;

import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.annotation.Nullable;

/**
 * A tokenizer for the Buck query language.
 *
 * <p>Note, we can avoid a lot of quoting by noting that the characters [() ,] do not appear in any
 * label, filename, function name, or regular expression we care about.
 *
 * <p>No string escapes are allowed ("\"). Given the domain, that's not currently a problem.
 */
final class Lexer {

  /** Discriminator for different kinds of tokens. */
  public enum TokenKind {
    WORD("word"),
    EOF("EOF"),

    COMMA(","),
    EQUALS("="),
    LPAREN("("),
    MINUS("-"),
    PLUS("+"),
    RPAREN(")"),
    CARET("^"),

    __ALL_IDENTIFIERS_FOLLOW(""), // See below

    IN("in"),
    LET("let"),
    SET("set"),

    INTERSECT("intersect"),
    EXCEPT("except"),
    UNION("union");

    private final String prettyName;

    private TokenKind(String prettyName) {
      this.prettyName = prettyName;
    }

    public String getPrettyName() {
      return prettyName;
    }
  }

  public static final Set<TokenKind> BINARY_OPERATORS =
      EnumSet.of(
          TokenKind.INTERSECT,
          TokenKind.CARET,
          TokenKind.UNION,
          TokenKind.PLUS,
          TokenKind.EXCEPT,
          TokenKind.MINUS);

  private static final Map<String, TokenKind> keywordMap = new HashMap<>();

  static {
    for (TokenKind kind : EnumSet.allOf(TokenKind.class)) {
      if (kind.ordinal() > TokenKind.__ALL_IDENTIFIERS_FOLLOW.ordinal()) {
        keywordMap.put(kind.getPrettyName(), kind);
      }
    }
  }

  /** Returns true iff 'word' is a reserved word of the language. */
  static boolean isReservedWord(String word) {
    return keywordMap.containsKey(word);
  }
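  // For example, isReservedWord("union") and isReservedWord("let") return true because those
  // constants follow __ALL_IDENTIFIERS_FOLLOW in TokenKind, whereas isReservedWord("deps")
  // returns false and "deps" is lexed as an ordinary WORD.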
  /** Tokens returned by the Lexer. */
  static class Token {
    public final TokenKind kind;
    @Nullable public final String word;

    Token(TokenKind kind) {
      this.kind = kind;
      this.word = null;
    }

    Token(String word) {
      this.kind = TokenKind.WORD;
      this.word = word;
    }

    @Override
    public boolean equals(Object other) {
      return (other instanceof Token) && equalTo((Token) other);
    }

    boolean equalTo(Token other) {
      return this.kind.equals(other.kind)
          && ((word != null && word.equals(other.word)) || (word == null && other.word == null));
    }

    @Override
    public int hashCode() {
      int h = 31;
      h = h * 17 + kind.hashCode();
      if (word != null) {
        h = h * 17 + word.hashCode();
      }
      return h;
    }

    @Override
    public String toString() {
      return kind == TokenKind.WORD ? Preconditions.checkNotNull(word) : kind.getPrettyName();
    }
  }

  /**
   * Entry point to the lexer. Returns the list of tokens for the specified input, or throws
   * QueryException.
   */
  public static List<Token> scan(char[] buffer) throws QueryException {
    Lexer lexer = new Lexer(buffer);
    lexer.tokenize();
    return lexer.tokens;
  }

  // Input buffer and position
  @Nullable private char[] buffer;
  private int pos;

  private final List<Token> tokens = new ArrayList<>();

  private Lexer(char[] buffer) {
    this.buffer = buffer;
    this.pos = 0;
  }

  private void addToken(Token s) {
    tokens.add(s);
  }

  /**
   * Scans a quoted word delimited by 'quot'.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter
   * ON EXIT: 'pos' is 1 + the index of the last delimiter.
   *
   * @return the word token.
   */
  private Token quotedWord(char quot) throws QueryException {
    int oldPos = pos - 1;
    Preconditions.checkNotNull(buffer);
    while (pos < buffer.length) {
      char c = buffer[pos++];
      switch (c) {
        case '\'':
        case '"':
          if (c == quot) {
            // close-quote, all done.
            return new Token(bufferSlice(oldPos + 1, pos - 1));
          }
      }
    }
    throw new QueryException("unclosed quotation");
  }

  private TokenKind getTokenKindForWord(String word) {
    TokenKind kind = keywordMap.get(word);
    return kind == null ? TokenKind.WORD : kind;
  }

  // Unquoted words may contain [-*$], but not start with them. For user convenience, unquoted
  // words must include UNIX filenames, labels and target label patterns, and simple regexps
  // (e.g. cc_.*). Keep consistent with TargetLiteral.toString()!
  private String scanWord() {
    Preconditions.checkNotNull(buffer);
    int oldPos = pos - 1;
    while (pos < buffer.length) {
      switch (buffer[pos]) {
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h':
        case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p':
        case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H':
        case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P':
        case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
        case '8': case '9':
        case '*': case '/': case '@': case '.': case '-': case '_': case ':': case '$':
        case '#':
          pos++;
          break;
        default:
          return bufferSlice(oldPos, pos);
      }
    }
    return bufferSlice(oldPos, pos);
  }
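  // For example, scanning a word that starts at the 'd' of the (hypothetical) input
  // "deps(//foo:bar)" yields "deps" and stops at '(', while scanning from the first '/' of
  // "//foo:bar)" yields "//foo:bar" and stops at ')', since parentheses are not word characters.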
  /**
   * Scans a word or keyword.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the word.
   * ON EXIT: 'pos' is 1 + the index of the last char in the word.
   *
   * @return the word or keyword token.
   */
  private Token wordOrKeyword() {
    String word = scanWord();
    TokenKind kind = getTokenKindForWord(word);
    return kind == TokenKind.WORD ? new Token(word) : new Token(kind);
  }

  /**
   * Performs tokenization of the character buffer of file contents provided to the constructor.
   */
  private void tokenize() throws QueryException {
    Preconditions.checkNotNull(buffer);
    while (pos < buffer.length) {
      char c = buffer[pos];
      pos++;
      switch (c) {
        case '(': {
          addToken(new Token(TokenKind.LPAREN));
          break;
        }
        case ')': {
          addToken(new Token(TokenKind.RPAREN));
          break;
        }
        case ',': {
          addToken(new Token(TokenKind.COMMA));
          break;
        }
        case '+': {
          addToken(new Token(TokenKind.PLUS));
          break;
        }
        case '-': {
          addToken(new Token(TokenKind.MINUS));
          break;
        }
        case '=': {
          addToken(new Token(TokenKind.EQUALS));
          break;
        }
        case '^': {
          addToken(new Token(TokenKind.CARET));
          break;
        }
        case '\n':
        case ' ':
        case '\t':
        case '\r': {
          /* ignore */
          break;
        }
        case '\'':
        case '\"': {
          addToken(quotedWord(c));
          break;
        }
        default: {
          addToken(wordOrKeyword());
          break;
        } // default
      } // switch
    } // while

    addToken(new Token(TokenKind.EOF));

    this.buffer = null; // release buffer now that we have our tokens
  }

  private String bufferSlice(int start, int end) {
    return new String(this.buffer, start, end - start);
  }
}
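// Illustrative usage sketch (the query string and target labels are hypothetical):
//
//   List<Lexer.Token> tokens = Lexer.scan("deps(//foo:bar) union set('//baz:qux')".toCharArray());
//   // tokens: WORD("deps"), LPAREN, WORD("//foo:bar"), RPAREN, UNION, SET, LPAREN,
//   //         WORD("//baz:qux"), RPAREN, EOF
//
// Callers must handle the checked QueryException thrown for malformed input such as an
// unclosed quotation.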