// Rapids.java example
//
// Explorer
// h2o-3-master
package water.rapids;

import water.fvec.Frame;
import water.rapids.ast.AstExec;
import water.rapids.ast.AstFunction;
import water.rapids.ast.AstParameter;
import water.rapids.ast.AstRoot;
import water.rapids.ast.params.*;
import water.util.CollectionUtils;
import water.util.StringUtils;

import java.util.ArrayList;
import java.util.Map;
import java.util.Set;

/**
 * <p> Rapids is an interpreter of abstract syntax trees.
 *
 * <p> This file contains the AstRoot parser and parser helper functions.
 * AstRoot Execution starts in the AstExec file, but spreads throughout Rapids.
 *
 * <p> Trees have a Lisp-like structure with the following "reserved" special
 * characters:
 * <dl>
 *   <dt> '('   <dd> a nested function application expression till ')'
 *   <dt> '{'   <dd> a nested function definition  expression till '}'
 *   <dt> '['   <dd> a numeric or string list expression, till ']'
 *   <dt> '"'   <dd> a String (double quote)
 *   <dt> "'"   <dd> a String (single quote)
 *   <dt> digits: <dd> a number
 *   <dt> letters or other specials: <dd> an ID
 * </dl>
 *
 * <p> Variables are lexically scoped inside 'let' expressions or at the top-level
 * looked-up in the DKV directly (and must refer to a known type that is valid
 * on the execution stack).
 */
public class Rapids {
  private final String _str;  // Statement to parse and execute
  private int _x;             // Parse pointer, points to the index of the next character to be consumed

  /**
   * Parse a Rapids expression string into an Abstract Syntax Tree object.
   * The string must hold exactly one expression; only whitespace may follow it.
   * @param rapids expression to parse
   * @return root of the parsed tree
   * @throws IllegalASTException if the string is not a single well-formed expression
   */
  public static AstRoot parse(String rapids) {
    Rapids r = new Rapids(rapids);
    AstRoot res = r.parseNext();
    // skipWS() yields ' ' only when nothing but whitespace is left
    if (r.skipWS() != ' ')
      throw new IllegalASTException("Syntax error: illegal Rapids expression `" + rapids + "`");
    return res;
  }

  /**
   * Execute a single rapids call in a short-lived session
   * @param rapids expression to parse
   * @return whatever {@code session.end(val)} returns for the computed value
   */
  public static Val exec(String rapids) {
    Session session = new Session();
    try {
      AstRoot ast = Rapids.parse(rapids);
      Val val = session.exec(ast, null);
      // Any returned Frame has its REFCNT raised by +1, and the end(val) call
      // will account for that, copying Vecs as needed so that the returned
      // Frame is independent of the Session (which is disappearing).
      return session.end(val);
    } catch (Throwable ex) {
      // Parse failures land here too: the session is always closed before rethrowing
      throw session.endQuietly(ex);
    }
  }

  /**
   * Compute and return a value in this session.  Any returned frame shares
   * Vecs with the session (is not deep copied), and so must be deleted by the
   * caller (with a Rapids "rm" call) or will disappear on session exit, or is
   * a normal global frame.
   * @param rapids expression to parse
   * @param session session to execute in; it is also used as the lock for the call
   * @return the value computed by {@code session.exec}
   */
  @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
  public static Val exec(String rapids, Session session) {
    // Parsing touches no session state, so it is done outside the lock
    AstRoot ast = Rapids.parse(rapids);
    // Synchronize the session, to stop back-to-back overlapping Rapids calls
    // on the same session, which Flow sometimes does
    synchronized (session) {
      Val val = session.exec(ast, null);
      // Any returned Frame has its REFCNT raised by +1, but is exiting the
      // session.  If it's a global, we simply need to lower the internal refcnts
      // (which won't delete on zero cnts because of the global).  If it's a
      // named temp, the ref cnts are accounted for by being in the temp table.
      if (val.isFrame()) {
        Frame frame = val.getFrame();
        assert frame._key != null : "Returned frame has no key";
        session.addRefCnt(frame, -1);
      }
      return val;
    }
  }


  //--------------------------------------------------------------------------------------------------------------------
  // Private
  //--------------------------------------------------------------------------------------------------------------------

  // Set of characters that cannot appear inside a token
  private static final Set<Character> invalidTokenCharacters = StringUtils.toCharacterSet("({[]}) \t\r\n\\\"\'");

  // Set of characters that may appear in a number. Note that "NaN" or "nan" is also a number.
  private static final Set<Character> validNumberCharacters = StringUtils.toCharacterSet("0123456789.-+eEnNaA");

  // All "simple" backslash-escape sequences (those that are exactly 2 characters long, e.g. '\n'),
  // keyed by the character that follows the backslash.
  private static final Map<Character, Character> simpleEscapeSequences =
      CollectionUtils.createMap(StringUtils.toCharacterArray("ntrfb'\"\\"),
                                StringUtils.toCharacterArray("\n\t\r\f\b'\"\\"));


  /**
   * The constructor is private: rapids expression can be parsed into an AST tree, or executed, but the "naked" Rapids
   * object has no external purpose.
   * @param rapidsStr String containing a Rapids expression.
   */
  private Rapids(String rapidsStr) {
    _str = rapidsStr;
    _x = 0;
  }

  /**
   * Parse and return the next expression from the rapids string. Leading whitespace is skipped, then the first
   * character selects the kind of expression:
   * <pre>
   *   (        a nested function application expression, till the matching )
   *   {        a nested function definition expression, till the matching }
   *   [        a numeric or string list expression, till ]
   *   " or '   a String in double or single quotes
   *   digit    a number
   *   -        a number when immediately followed by a digit, otherwise an ID
   *   other    an ID
   * </pre>
   * @throws IllegalASTException if the input is exhausted or the expression is malformed
   */
  private AstRoot parseNext() {
    switch (skipWS()) {
      case '(':  return parseFunctionApplication();
      case '{':  return parseFunctionDefinition();
      case '[':  return parseList();
      case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
        return new AstNum(number());
      case '-':  return (peek(1) >= '0' && peek(1) <= '9') ? new AstNum(number()) : new AstId(token());
      case '\"': case '\'':
        return new AstStr(string());
      // skipWS() never returns whitespace unless the input has run out
      case ' ':  throw new IllegalASTException("Expected an expression but ran out of text");
      default:  return new AstId(token());
    }
  }

  /**
   * Parse "function application" expression, i.e. pattern of the form "(func ...args)".
   * A closing parenthesis immediately followed by "->name" is rewritten into the
   * application {@code (tmp= name (func ...args))}.
   */
  private AstExec parseFunctionApplication() {
    eatChar('(');
    ArrayList<AstRoot> asts = new ArrayList<>();
    while (skipWS() != ')')
      asts.add(parseNext());
    eatChar(')');
    AstExec res = new AstExec(asts);
    // No whitespace is skipped here: the arrow must be attached to the ')'
    if (peek(0) == '-') {
      eatChar('-');
      eatChar('>');
      AstId tmpid = new AstId(token());
      res = new AstExec(new AstRoot[]{new AstId("tmp="), tmpid, res});
    }
    return res;
  }

  /**
   * Parse and return a user defined function of the form "{arg1 arg2 . (expr)}".
   * Every argument name must be a valid Java identifier.
   */
  private AstFunction parseFunctionDefinition() {
    eatChar('{');

    // Parse the list of ids
    ArrayList<String> ids = new ArrayList<>();
    ids.add("");  // 1-based ID list
    while (skipWS() != '.') {
      String id = token();  // token() never returns an empty string, so charAt(0) is safe
      if (!Character.isJavaIdentifierStart(id.charAt(0)))
        throw new IllegalASTException("variable must be a valid Java identifier: " + id);
      for (char c : id.toCharArray())
        if (!Character.isJavaIdentifierPart(c))
          throw new IllegalASTException("variable must be a valid Java identifier: " + id);
      ids.add(id);
    }

    // Single dot separates the list of ids from the body of the function
    eatChar('.');

    // Parse the body
    AstRoot body = parseNext();
    if (skipWS() != '}')
      throw new IllegalASTException("Expected the end of the function, but found '" + peek(0) + "'");
    eatChar('}');

    return new AstFunction(ids, body);
  }

  /**
   * Parse and return a list of tokens: either a list of strings, or a list of numbers.
   * The first non-whitespace character after '[' decides which: a quote starts a string list, anything else
   * (including an immediate ']') is handled as a number list.
   * We do not support lists of mixed types, or lists containing variables (for now).
   */
  private AstParameter parseList() {
    eatChar('[');
    char nextChar = skipWS();
    AstParameter res = isQuote(nextChar) ? parseStringList() : parseNumList();
    eatChar(']');
    return res;
  }

  /**
   * Parse a list of strings. Strings can be either in single- or in double quotes, and may be separated by
   * whitespace and/or a single optional comma. Parsing stops at the first element that does not start with a quote.
   */
  private AstStrList parseStringList() {
    ArrayList<String> strs = new ArrayList<>(10);
    while (isQuote(skipWS())) {
      strs.add(string());
      if (skipWS() == ',') eatChar(',');
    }
    return new AstStrList(strs);
  }

  /**
   * Parse a "num list". This could be either a plain list of numbers, or a range, or a list of ranges. For example
   * [2 3 4 5 6 7] can also be written as [2:6] or [2:2 4:4:1]. The format of each "range" is `start:count[:stride]`,
   * and it denotes the sequence {start, start + stride, ..., start + (count-1)*stride}. Here start and stride may
   * be real numbers, however count must be a positive integer (at least 1). Negative or NaN strides are not allowed,
   * and when count is 1 the stride, if given, must be 1.
   */
  private AstNumList parseNumList() {
    ArrayList<Double> bases = new ArrayList<>();
    ArrayList<Double> strides = new ArrayList<>();
    ArrayList<Long> counts = new ArrayList<>();

    while (skipWS() != ']') {
      double base = number();
      double count = 1;
      double stride = 1;
      if (skipWS() == ':') {
        eatChar(':');
        skipWS();
        count = number();
        // The cast round-trip rejects fractional counts (and NaN, which casts to 0)
        if (count < 1 || ((long) count) != count)
          throw new IllegalASTException("Count must be a positive integer, got " + count);
      }
      if (skipWS() == ':') {
        eatChar(':');
        skipWS();
        stride = number();
        if (stride < 0 || Double.isNaN(stride))
          throw new IllegalASTException("Stride must be positive, got " + stride);
      }
      if (count == 1 && stride != 1)
        throw new IllegalASTException("If count is 1, then stride must be one (and ignored)");
      bases.add(base);
      counts.add((long) count);
      strides.add(stride);
      // Optional comma separating span
      if (skipWS() == ',') eatChar(',');
    }

    return new AstNumList(bases, strides, counts);
  }

  /**
   * Return the character at the current parse position (or `offset` chars in the future), without advancing it.
   * If there are no more characters to peek, return ' '.
   */
  private char peek(int offset) {
    return _x + offset < _str.length() ? _str.charAt(_x + offset) : ' ';
  }

  /**
   * Consume the next character from the parse stream, throwing an exception if it is not `c`.
   * At the end of the input the "next character" is reported as ' ' (see {@link #peek(int)}).
   */
  private void eatChar(char c) {
    if (peek(0) != c)
      throw new IllegalASTException("Expected '" + c + "'. Got: '" + peek(0) + "'");
    _x++;
  }

  /**
   * Advance parse pointer to the first non-whitespace character, and return that character.
   * If such non-whitespace character cannot be found, the pointer stops at the end of the input and ' ' is returned.
   */
  private char skipWS() {
    while (_x < _str.length() && isWS(_str.charAt(_x))) _x++;
    // peek(0) is ' ' once the input is exhausted, whichever whitespace character was skipped last
    return peek(0);
  }

  /**
   * Parse a "token" from the input stream. A token is terminated by the next whitespace, by any bracket
   * character <code>( ) { } [ ]</code>, by a backslash, by a single or double quote, or by the end of the input.
   * Commas and colons do not terminate a token.
   *
   * NOTE: our notion of "token" is very permissive. We may want to restrict it in the future...
   *
   * @throws IllegalASTException if the token would be empty
   */
  private String token() {
    int start = _x;
    // At the end of the input peek(0) is ' ', which is an invalid token character, so the loop stops
    while (!invalidTokenCharacters.contains(peek(0))) _x++;
    if (start == _x) throw new IllegalASTException("Missing token");
    return _str.substring(start, _x);
  }

  /**
   * Parse a number from the token stream. The longest run of characters that may appear in a number is consumed;
   * "nan" in any letter case yields {@link Double#NaN}, everything else goes through
   * {@link Double#parseDouble(String)}.
   *
   * @throws IllegalASTException if no number characters are present, or if they do not form a valid number
   */
  private double number() {
    int start = _x;
    while (validNumberCharacters.contains(peek(0))) _x++;
    if (start == _x) throw new IllegalASTException("Missing a number");
    String s = _str.substring(start, _x);
    if (s.equalsIgnoreCase("nan")) return Double.NaN;
    try {
      return Double.parseDouble(s);
    } catch (NumberFormatException e) {
      throw new IllegalASTException(e.toString(), e);
    }
  }

  /**
   * Parse a string from the token stream. The current character is taken as the opening quote and the string runs
   * until the next unescaped occurrence of that same quote character. Supported escapes are the simple two-character
   * sequences, plus 'x', 'u' and 'U' followed by 2, 4 and 8 hexadecimal digits respectively.
   *
   * @throws IllegalASTException on an unknown escape, bad hex digits, an illegal code point, or a missing closing quote
   */
  private String string() {
    char quote = peek(0);
    int start = ++_x;
    boolean hasEscapes = false;
    // First pass: only find the closing quote, stepping over escape sequences as whole units
    while (_x < _str.length()) {
      char c = peek(0);
      if (c == '\\') {
        hasEscapes = true;
        char cc = peek(1);
        if (simpleEscapeSequences.containsKey(cc)) {
          _x += 2;
        } else if (cc == 'x') {
          _x += 4;   // e.g: \x5A
        } else if (cc == 'u') {
          _x += 6;   // e.g: \u1234
        } else if (cc == 'U') {
          _x += 10;  // e.g: \U0010FFFF
        } else
          throw new IllegalASTException("Invalid escape sequence \\" + cc);
      } else if (c == quote) {
        int end = _x++;  // `end` is the index of the closing quote; the pointer moves past it
        return hasEscapes ? unescape(start, end) : _str.substring(start, end);
      } else {
        _x++;
      }
    }
    throw new IllegalASTException("Unterminated string at " + start);
  }

  /**
   * Decode the backslash escapes in {@code _str[start, end)}. Only called from {@link #string()}, whose scan has
   * already verified that every backslash starts a known escape lying fully inside the range.
   */
  private String unescape(int start, int end) {
    StringBuilder sb = new StringBuilder(end - start);
    for (int i = start; i < end; i++) {
      char ch = _str.charAt(i);
      if (ch != '\\') {
        sb.append(ch);
        continue;
      }
      char cc = _str.charAt(++i);
      if (simpleEscapeSequences.containsKey(cc)) {
        sb.append(simpleEscapeSequences.get(cc));
        continue;
      }
      int n;  // number of hex digits that follow the escape letter
      if (cc == 'x') n = 2;
      else if (cc == 'u') n = 4;
      else if (cc == 'U') n = 8;
      else throw new IllegalASTException("Invalid escape sequence \\" + cc);  // not reachable after the scan
      int hex;
      try {
        hex = StringUtils.unhex(_str.substring(i + 1, i + 1 + n));
      } catch (NumberFormatException e) {
        throw new IllegalASTException(e.toString(), e);
      }
      // Checks both ends of the range: 8 hex digits need not fit into a non-negative int
      // (NOTE(review): depends on how StringUtils.unhex accumulates digits - confirm)
      if (!Character.isValidCodePoint(hex))
        throw new IllegalASTException("Illegal unicode codepoint " + hex);
      sb.append(Character.toChars(hex));
      i += n;
    }
    return sb.toString();
  }

  /**
   * Return true if `c` is a whitespace character.
   */
  private static boolean isWS(char c) {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
  }

  /**
   * Return true if `c` is a quote character.
   */
  private static boolean isQuote(char c) {
    return c == '\'' || c == '\"';
  }


  // Return unparsed text, useful in error messages and debugging
  // private String unparsed() {
  //   return _str.substring(_x, _str.length());
  // }

  //  public AstRoot throwErr(String msg) {
  //    int idx = _str.length() - 1;
  //    int lo = _x, hi = idx;
  //
  //    if (idx < lo) {
  //      lo = idx;
  //      hi = lo;
  //    }
  //    String s = msg + '\n' + _str + '\n';
  //    int i;
  //    for (i = 0; i < lo; i++) s += ' ';
  //    s += '^';
  //    i++;
  //    for (; i < hi; i++) s += '-';
  //    if (i <= hi) s += '^';
  //    s += '\n';
  //    throw new IllegalASTException(s);
  //  }

  /**
   * Exception thrown for every syntax problem found while parsing a Rapids expression.
   */
  public static class IllegalASTException extends IllegalArgumentException {
    /**
     * @param s description of the syntax problem
     */
    public IllegalASTException(String s) {
      super(s);
    }

    /**
     * @param s description of the syntax problem
     * @param cause lower-level exception that revealed the problem
     */
    public IllegalASTException(String s, Throwable cause) {
      super(s, cause);
    }
  }
}