ParserBase.java example

Explorer

DrawBridge-master
- src
  - com
    - drawbridge
    - google
      - caja
        ancillary
        linter
        ErrorReporter.java
        ExitModes.java
        FileContent.java
        LexicalScope.java
        LintHelper.java
        Linter.java
        LinterMessageType.java
        LiveSet.java
        NodeBuckets.java
        ScopeAnalyzer.java
        SymbolTable.java
        VariableLiveness.java
        package-info.java
        parser
        AbstractParseTreeNode.java
        AncestorChain.java
        ChildNodes.java
        MutableParseTreeNode.java
        Normalizer.java
        ParseTreeNode.java
        ParseTreeNodeContainer.java
        ParseTreeNodeVisitor.java
        ParseTreeNodes.java
        ParserBase.java
        ParserContext.java
        Visitor.java
        js
        AbstractExpression.java
        AbstractStatement.java
        ArrayConstructor.java
        AssignOperation.java
        Associativity.java
        Block.java
        BooleanLiteral.java
        BreakStmt.java
        CajoledModule.java
        CaseStmt.java
        CatchStmt.java
        Conditional.java
        ContinueStmt.java
        ControlOperation.java
        DebuggerStmt.java
        Declaration.java
        DefaultCaseStmt.java
        Directive.java
        DirectivePrologue.java
        DoWhileLoop.java
        Elision.java
        Expression.java
        ExpressionStmt.java
        FinallyStmt.java
        ForEachLoop.java
        ForLoop.java
        FormalParam.java
        FunctionConstructor.java
        FunctionDeclaration.java
        GetterProperty.java
        Identifier.java
        IntegerLiteral.java
        LabeledStatement.java
        LabeledStmtWrapper.java
        Literal.java
        Loop.java
        MultiDeclaration.java
        NestedScope.java
        NoChildren.java
        Noop.java
        NullLiteral.java
        NumberLiteral.java
        ObjProperty.java
        ObjectConstructor.java
        Operation.java
        Operator.java
        OperatorCategory.java
        OperatorType.java
        Parser.java
        RealLiteral.java
        Reference.java
        RegexpLiteral.java
        ReturnStmt.java
        SetterProperty.java
        SimpleOperation.java
        SpecialOperation.java
        Statement.java
        StringLiteral.java
        SwitchCase.java
        SwitchStmt.java
        SyntheticNodes.java
        ThrowStmt.java
        TranslatedCode.java
        TryStmt.java
        UncajoledModule.java
        ValueProperty.java
        WhileLoop.java
        WithStmt.java
        scope
        AbstractScope.java
        ES5ScopeAnalyzer.java
        JScriptScopeAnalyzer.java
        ScopeAnalyzer.java
        ScopeListener.java
        ScopeType.java
        WorstCaseScopeAnalyzer.java
        package-info.java
        util
        CajaTestCase.java
        TestUtil.java

// Copyright (C) 2005 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.caja.parser;

import com.google.caja.lexer.JsTokenQueue;
import com.google.caja.lexer.JsTokenType;
import com.google.caja.lexer.Keyword;
import com.google.caja.lexer.ParseException;
import com.google.caja.lexer.Token;
import com.google.caja.reporting.Message;
import com.google.caja.reporting.MessagePart;
import com.google.caja.reporting.MessageQueue;
import com.google.caja.reporting.MessageType;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Some identifier, keyword, and number handling routines used by multiple
 * parsers.
 *
 * @author mikesamuel@gmail.com
 * @author ihab.awad@gmail.com
 */
public abstract class ParserBase {
  /** Token source that identifiers are read from. */
  protected final JsTokenQueue tq;
  /** Receives diagnostics that do not abort parsing, e.g. reserved word use. */
  protected final MessageQueue mq;
  /**
   * True if {@link #isIdentifier} should also accept the quasiliteral
   * identifier forms described on {@link #QUASI_IDENTIFIER_OR_KEYWORD_RE}.
   */
  protected final boolean isQuasiliteral;

  /**
   * Creates a parser base that does not accept quasiliteral identifiers.
   *
   * @param tq the token queue to read from.
   * @param mq the queue that diagnostics are added to.
   */
  public ParserBase(JsTokenQueue tq, MessageQueue mq) {
    this(tq, mq, false);
  }

  /**
   * @param tq the token queue to read from.
   * @param mq the queue that diagnostics are added to.
   * @param isQuasiliteral true to have {@link #isIdentifier} accept
   *     quasiliteral identifiers in addition to plain ones.
   */
  public ParserBase(JsTokenQueue tq, MessageQueue mq, boolean isQuasiliteral) {
    this.tq = tq;
    this.mq = mq;
    this.isQuasiliteral = isQuasiliteral;
  }

  public JsTokenQueue getTokenQueue() { return tq; }
  public MessageQueue getMessageQueue() { return mq; }

  /**
   * Consumes the token at the front of the queue as an identifier.
   *
   * <p>Use of a reserved word when {@code allowReservedWords} is false is
   * reported on the message queue but does not stop parsing: the token is still
   * consumed and its decoded text returned.
   *
   * @param allowReservedWords if false, a message of type
   *     {@code RESERVED_WORD_USED_AS_IDENTIFIER} is added to the message queue
   *     when the token is a keyword or a word that decodes to a keyword.
   * @return the token text with escapes decoded by {@link #decodeIdentifier}.
   * @throws ParseException if the token is a word that fails
   *     {@link #isIdentifier}, or is neither a word nor a keyword.  The token
   *     is not consumed in either case.
   */
  public String parseIdentifier(boolean allowReservedWords)
      throws ParseException {
    Token<JsTokenType> t = tq.peek();
    String s = t.text;
    switch (t.type) {
      case WORD:
        if (!allowReservedWords) {
          // Decode first so that a keyword spelled using escapes is still
          // recognized as a keyword.
          Keyword k = Keyword.fromString(decodeIdentifier(s));
          if (null != k) {
            mq.addMessage(MessageType.RESERVED_WORD_USED_AS_IDENTIFIER,
                tq.currentPosition(), k);
          }
        }
        if (!isIdentifier(s)) {
          throw new ParseException(
              new Message(MessageType.INVALID_IDENTIFIER,
                          tq.currentPosition(), MessagePart.Factory.valueOf(s))
              );
        }
        break;
      case KEYWORD:
        if (!allowReservedWords) {
          mq.addMessage(MessageType.RESERVED_WORD_USED_AS_IDENTIFIER,
                        tq.currentPosition(), Keyword.fromString(s));
        }
        break;
      default:
        throw new ParseException(
            new Message(MessageType.EXPECTED_TOKEN, tq.currentPosition(),
                        MessagePart.Factory.valueOf("an identifier"),
                        MessagePart.Factory.valueOf(s)));
    }
    tq.advance();
    return decodeIdentifier(s);
  }

  /**
   * String form of a regular expression that matches the javascript
   * IdentifierOrKeyword production, with extensions for quasiliteral
   * syntax.
   * <p>From section 7.6 of EcmaScript 262 Edition 3 (ES3), currently found at
   * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
   * and based on http://www.erights.org/elang/grammar/quasi-overview.html
   * <pre>
   * <b>QuasiIdentifierOrKeyword</b> ->
   *       IdentifierOrKeyword
   *    |  QuasiliteralBegin IdentifierOrKeyword OptQuasiliteralQuantifier
   * <b>IdentifierOrKeyword</b> ->
   *       IdentifierName (but not Keyword)
   * <b>IdentifierName</b> ->
   *       IdentifierStart
   *    |  IdentifierName IdentifierPart
   * <b>IdentifierStart</b> ->
   *       UnicodeLetter  |  $  |  _  |  \ UnicodeEscapeSequence
   * <b>IdentifierPart</b> ->
   *       IdentifierStart  |  UnicodeCombiningMark  |  UnicodeDigit
   *    |  UnicodeConnectorPunctuation  |  \ UnicodeEscapeSequence
   * <b>UnicodeLetter</b> ->
   *       any character in the Unicode categories "Uppercase letter
   *       (Lu)", "Lowercase letter (Ll)", "Titlecase letter (Lt)",
   *       "Modifier letter (Lm)", "Other letter (Lo)", or "Letter
   *       number (Nl)".
   * <b>UnicodeCombiningMark</b> ->
   *       any character in the Unicode categories "Non-spacing mark (Mn)"
   *       or "Combining spacing mark (Mc)"
   * <b>UnicodeDigit</b> ->
   *       any character in the Unicode category "Decimal number (Nd)"
   * <b>UnicodeConnectorPunctuation</b> ->
   *       any character in the Unicode category "Connector punctuation (Pc)"
   * <b>UnicodeEscapeSequence</b> ->
   *       u HexDigit HexDigit HexDigit HexDigit
   * <b>HexDigit</b> ->
   *       0  |  1  |  2  |  3  |  4  |  5  |  6  |  7  |  8  |  9  |  a
   *    |  b  |  c  |  d  |  e  |  f  |  A  |  B  |  C  |  D  |  E  |  F
   * <b>QuasiliteralBegin</b> ->
   *       '@'
   * <b>OptQuasiliteralQuantifier</b> ->
   *       &epsilon;
   *    |  '*'
   *    |  '+'
   *    |  '?'
   * </pre>
   * A <i>UnicodeEscapeSequence</i> cannot be used to put a character
   * into an identifier that would otherwise be illegal.
   * <p>The patterns contain no escape handling of their own; callers decode
   * escapes with {@link #decodeIdentifier} before matching.
   * <p>NOTE(review): letters are matched with the {@code javaLetter} class,
   * which as far as the JDK documentation says does not cover category Nl
   * listed under UnicodeLetter above -- confirm whether that is intentional.
   */
  private static final Pattern IDENTIFIER_OR_KEYWORD_RE;
  /**
   * Matches the QuasiIdentifierOrKeyword production: a plain identifier, or
   * an identifier preceded by {@code @} and optionally followed by one of
   * {@code *}, {@code +}, or {@code ?}.
   */
  private static final Pattern QUASI_IDENTIFIER_OR_KEYWORD_RE;
  private static final Pattern UNICODE_ESCAPE;  // hexDigits captured in group 1
  static {
    String hexDigit = "[0-9a-fA-F]";
    String letter = "\\p{javaLetter}";
    String letterOrDigit = "\\p{javaLetterOrDigit}";
    String combinerOrConnector = "\\p{Mn}\\p{Mc}\\p{Pc}";
    String identifierStart = "[" + letter + "$_]";
    String identifierPart = "[" + letterOrDigit + combinerOrConnector + "$_]";
    String quasiliteralBegin = "@";
    String optQuasiliteralQuantifier = "[\\+\\*\\?]?";
    String identifierOrKeyword = identifierStart + identifierPart + "*";
    IDENTIFIER_OR_KEYWORD_RE = Pattern.compile("^" + identifierOrKeyword + "$");
    // The two alternatives are wrapped in a single group so that the anchors
    // added below apply to both of them.  Without the outer group, "|" binds
    // more loosely than the anchors and each anchor would constrain only one
    // alternative, which is only safe while every caller uses matches().
    String quasiIdentifierOrKeyword = (
        "(?:"
        + "(?:" + identifierOrKeyword + ")"
        + "|"
        + "(?:" + (
            quasiliteralBegin
            + identifierOrKeyword
            + optQuasiliteralQuantifier)
        + ")"
        + ")");
    QUASI_IDENTIFIER_OR_KEYWORD_RE = Pattern.compile(
        "^" + quasiIdentifierOrKeyword + "$");

    UNICODE_ESCAPE = Pattern.compile("\\\\u(" + hexDigit + "{4})");
  }

  /**
   * True if {@code s}, once its escapes are decoded, matches the plain
   * identifier grammar and the undecoded {@code s} passes
   * {@link Normalizer#isNormalized}.
   */
  public static boolean isJavascriptIdentifier(String s) {
    return matchesNormalized(IDENTIFIER_OR_KEYWORD_RE, s);
  }

  /**
   * Like {@link #isJavascriptIdentifier} but also accepts the quasiliteral
   * forms, regardless of how any particular parser instance is configured.
   */
  public static boolean isQuasiIdentifier(String s) {
    return matchesNormalized(QUASI_IDENTIFIER_OR_KEYWORD_RE, s);
  }

  /**
   * True if {@code s} is an identifier for this parser: quasiliteral forms
   * are accepted only when this instance was created with
   * {@code isQuasiliteral} set.
   */
  public boolean isIdentifier(String s) {
    return matchesNormalized(
        isQuasiliteral
            ? QUASI_IDENTIFIER_OR_KEYWORD_RE
            : IDENTIFIER_OR_KEYWORD_RE,
        s);
  }

  /**
   * Shared implementation of the identifier predicates: matches the decoded
   * form of {@code s} against {@code grammar}, and then checks the raw,
   * undecoded {@code s} with {@link Normalizer#isNormalized}.
   */
  private static boolean matchesNormalized(Pattern grammar, String s) {
    return grammar.matcher(decodeIdentifier(s)).matches()
        && Normalizer.isNormalized(s);
  }

  /**
   * Decodes escapes in an identifier to their literal codepoints so that
   * identifiers can be compared for equality via string equality of their
   * values.
   *
   * <p>Only escapes of the form backslash, lowercase u, four hex digits are
   * decoded; any other text, including other uses of backslash, is copied
   * through unchanged.
   *
   * @param identifier the identifier text, possibly containing escapes.
   * @return the decoded text, or {@code identifier} itself if it contains no
   *     backslash.
   */
  public static String decodeIdentifier(String identifier) {
    // TODO(mikesamuel): is this true?
    // Javascript identifiers use a different escaping scheme from strings.
    // Specifically, \Uxxxxxxxx escapes handle extended unicode.  There are
    // 8 hex digits allowed even though extended unicode can't use more than
    // 6 of those.
    if (identifier.indexOf('\\') < 0) { return identifier; }  // nothing to decode
    StringBuffer sb = new StringBuffer();
    Matcher m = UNICODE_ESCAPE.matcher(identifier);
    while (m.find()) {
      // Copy the text preceding the escape and drop the escape itself, then
      // append the decoded character directly so that it is never
      // interpreted as replacement syntax.
      m.appendReplacement(sb, "");
      sb.append((char) Integer.parseInt(m.group(1), 16));
    }
    m.appendTail(sb);
    return sb.toString();
  }
}