/*
* Copyright 2015-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
// Copyright 2014 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.facebook.buck.query;
import com.google.common.base.Preconditions;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import javax.annotation.Nullable;
/**
 * A tokenizer for the Buck query language.
 *
 * <p>Note, we can avoid a lot of quoting by noting that the characters [() ,] do not appear in any
 * label, filename, function name, or regular expression we care about.
 *
 * <p>No string escapes are allowed ("\"). Given the domain, that's not currently a problem.
 */
final class Lexer {

  /** Discriminator for different kinds of tokens. */
  public enum TokenKind {
    WORD("word"),
    EOF("EOF"),
    COMMA(","),
    EQUALS("="),
    LPAREN("("),
    MINUS("-"),
    PLUS("+"),
    RPAREN(")"),
    CARET("^"),
    // Sentinel: every constant declared after this one is a reserved keyword whose pretty
    // name is the keyword's exact spelling (consumed by the keywordMap initializer below).
    __ALL_IDENTIFIERS_FOLLOW(""),
    IN("in"),
    LET("let"),
    SET("set"),
    INTERSECT("intersect"),
    EXCEPT("except"),
    UNION("union");

    private final String prettyName;

    TokenKind(String prettyName) {
      this.prettyName = prettyName;
    }

    /** Returns the user-visible spelling of this token kind. */
    public String getPrettyName() {
      return prettyName;
    }
  }

  /** Token kinds that act as binary set operators in the query language. */
  // Unmodifiable: a public constant must not be mutable by callers.
  public static final Set<TokenKind> BINARY_OPERATORS =
      Collections.unmodifiableSet(
          EnumSet.of(
              TokenKind.INTERSECT,
              TokenKind.CARET,
              TokenKind.UNION,
              TokenKind.PLUS,
              TokenKind.EXCEPT,
              TokenKind.MINUS));

  /** Maps keyword spelling (e.g. "union") to its TokenKind; built once, then frozen. */
  private static final Map<String, TokenKind> keywordMap;

  static {
    Map<String, TokenKind> map = new HashMap<>();
    for (TokenKind kind : EnumSet.allOf(TokenKind.class)) {
      // By convention, every constant after the sentinel is a reserved keyword.
      if (kind.ordinal() > TokenKind.__ALL_IDENTIFIERS_FOLLOW.ordinal()) {
        map.put(kind.getPrettyName(), kind);
      }
    }
    keywordMap = Collections.unmodifiableMap(map);
  }

  /** Returns true iff 'word' is a reserved word of the language. */
  static boolean isReservedWord(String word) {
    return keywordMap.containsKey(word);
  }

  /** Tokens returned by the Lexer. */
  static class Token {
    public final TokenKind kind;
    // Non-null iff kind == WORD.
    @Nullable public final String word;

    /** Creates a non-WORD token (operator, keyword, or EOF). */
    Token(TokenKind kind) {
      this.kind = kind;
      this.word = null;
    }

    /** Creates a WORD token carrying its text. */
    Token(String word) {
      this.kind = TokenKind.WORD;
      this.word = word;
    }

    @Override
    public boolean equals(Object other) {
      return (other instanceof Token) && equalTo((Token) other);
    }

    /** Structural equality: same kind and same (possibly null) word text. */
    boolean equalTo(Token other) {
      return kind == other.kind && Objects.equals(word, other.word);
    }

    @Override
    public int hashCode() {
      // Consistent with equalTo: derived from exactly the fields it compares.
      return Objects.hash(kind, word);
    }

    @Override
    public String toString() {
      return kind == TokenKind.WORD ? Objects.requireNonNull(word) : kind.getPrettyName();
    }
  }

  /**
   * Entry point to the lexer. Returns the list of tokens for the specified input, or throws
   * QueryException.
   */
  public static List<Token> scan(char[] buffer) throws QueryException {
    Lexer lexer = new Lexer(buffer);
    lexer.tokenize();
    return lexer.tokens;
  }

  // Input buffer and position. The buffer is nulled out once tokenization completes.
  @Nullable private char[] buffer;
  private int pos;

  private final List<Token> tokens = new ArrayList<>();

  private Lexer(char[] buffer) {
    this.buffer = buffer;
    this.pos = 0;
  }

  private void addToken(Token s) {
    tokens.add(s);
  }

  /**
   * Scans a quoted word delimited by 'quot'.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter ON EXIT: 'pos' is 1 + the index of
   * the last delimiter.
   *
   * @return the word token.
   * @throws QueryException if the input ends before the closing delimiter.
   */
  private Token quotedWord(char quot) throws QueryException {
    int oldPos = pos - 1;
    char[] buf = Objects.requireNonNull(buffer);
    while (pos < buf.length) {
      char c = buf[pos++];
      // Only the matching delimiter closes the word; the other quote char may appear inside.
      if (c == quot) {
        return new Token(bufferSlice(oldPos + 1, pos - 1));
      }
    }
    throw new QueryException("unclosed quotation");
  }

  /** Maps a scanned word to its keyword kind, or WORD if it is not reserved. */
  private TokenKind getTokenKindForWord(String word) {
    TokenKind kind = keywordMap.get(word);
    return kind == null ? TokenKind.WORD : kind;
  }

  /**
   * Returns true iff {@code c} may appear in (the continuation of) an unquoted word. The set is
   * deliberately restricted to ASCII letters, digits, and [*\/@.-_:$#] — do NOT widen to
   * Character.isLetterOrDigit, which would change which inputs tokenize as a single word.
   */
  private static boolean isWordChar(char c) {
    return (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9')
        || c == '*'
        || c == '/'
        || c == '@'
        || c == '.'
        || c == '-'
        || c == '_'
        || c == ':'
        || c == '$'
        || c == '#';
  }

  // Unquoted words may contain [-*$], but not start with them. For user convenience, unquoted
  // words must include UNIX filenames, labels and target label patterns, and simple regexps
  // (e.g. cc_.*). Keep consistent with TargetLiteral.toString()!
  //
  // ON ENTRY: 'pos' is 1 + the index of the word's first char (which is included unconditionally,
  // whatever it is); subsequent chars are consumed while they satisfy isWordChar.
  private String scanWord() {
    char[] buf = Objects.requireNonNull(buffer);
    int oldPos = pos - 1;
    while (pos < buf.length && isWordChar(buf[pos])) {
      pos++;
    }
    return bufferSlice(oldPos, pos);
  }

  /**
   * Scans a word or keyword.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the word. ON EXIT: 'pos' is 1 + the
   * index of the last char in the word.
   *
   * @return the word or keyword token.
   */
  private Token wordOrKeyword() {
    String word = scanWord();
    TokenKind kind = getTokenKindForWord(word);
    return kind == TokenKind.WORD ? new Token(word) : new Token(kind);
  }

  /** Performs tokenization of the character buffer of file contents provided to the constructor. */
  private void tokenize() throws QueryException {
    char[] buf = Objects.requireNonNull(buffer);
    while (pos < buf.length) {
      char c = buf[pos++];
      switch (c) {
        case '(':
          addToken(new Token(TokenKind.LPAREN));
          break;
        case ')':
          addToken(new Token(TokenKind.RPAREN));
          break;
        case ',':
          addToken(new Token(TokenKind.COMMA));
          break;
        case '+':
          addToken(new Token(TokenKind.PLUS));
          break;
        case '-':
          addToken(new Token(TokenKind.MINUS));
          break;
        case '=':
          addToken(new Token(TokenKind.EQUALS));
          break;
        case '^':
          addToken(new Token(TokenKind.CARET));
          break;
        case '\n':
        case ' ':
        case '\t':
        case '\r':
          // Whitespace is ignored.
          break;
        case '\'':
        case '\"':
          addToken(quotedWord(c));
          break;
        default:
          addToken(wordOrKeyword());
          break;
      }
    }
    addToken(new Token(TokenKind.EOF));
    this.buffer = null; // release buffer now that we have our tokens
  }

  /** Returns the substring of the input buffer covering [start, end). */
  private String bufferSlice(int start, int end) {
    return new String(Objects.requireNonNull(this.buffer), start, end - start);
  }
}