Tokenizer.java example

Explorer
cloud-odata-java-master
/*******************************************************************************
 * Copyright 2013 SAP AG
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.sap.core.odata.core.uri.expression;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.sap.core.odata.api.edm.EdmLiteral;
import com.sap.core.odata.api.edm.EdmLiteralException;
import com.sap.core.odata.api.edm.EdmSimpleTypeFacade;
import com.sap.core.odata.api.edm.EdmSimpleTypeKind;
import com.sap.core.odata.api.uri.expression.ExpressionParserException;
import com.sap.core.odata.core.edm.EdmSimpleTypeFacadeImpl;

/**
 * Expression tokenizer.
 * <p>
 * Splits an expression string into a {@link TokenList}. The tokenizer keeps a cursor
 * ({@code curPosition}) into the expression; every helper that recognizes a token appends it to
 * {@code tokens} and advances the cursor past the consumed characters.
 * <p>
 * Instances hold mutable cursor state and are therefore not safe for concurrent use.
 * @author SAP AG
 */
public class Tokenizer {

  /** Run of letters, digits and the listed punctuation characters: candidate for a number, keyword or name. */
  private static final Pattern OTHER_LIT = Pattern.compile("(?:\\p{L}|\\p{Digit}|[-._~%!$&*+;:@])+");
  /** Method name followed by optional blanks and an opening parenthesis (only the name is captured in group 1). */
  private static final Pattern FUNCTION = Pattern.compile("^(startswith|endswith|substring|substringof|indexof|replace|tolower|toupper|trim|concat|length|year|month|day|hour|minute|second|round|ceiling|floor)( *)\\(");
  /** Arithmetic operators and {@code not}, each followed by a single blank. */
  private static final Pattern ARITHMETIC_OR_NOT = Pattern.compile("^(add|sub|mul|div|mod|not) ");
  /** Logical and comparison operators, each followed by a single blank. */
  private static final Pattern LOGICAL_OR_COMPARISON = Pattern.compile("^(and|or|eq|ne|lt|gt|le|ge) ");
  /** Type prefix of a quoted literal, e.g. {@code datetime'...'}; the apostrophe is not part of group 1. */
  private static final Pattern LITERAL_PREFIX = Pattern.compile("^(X|binary|guid|datetime|datetimeoffset|time)'");

  private boolean flagIncludeWhitespace = false;
  private final EdmSimpleTypeFacade typeDetector;

  int curPosition;
  final String expression;
  final int expressionLength;
  TokenList tokens;

  /**
   * Creates a tokenizer for the given expression.
   * @param expression expression text to tokenize; must not be {@code null}
   */
  public Tokenizer(final String expression) {
    typeDetector = new EdmSimpleTypeFacadeImpl();
    this.expression = expression;
    expressionLength = expression.length();
    tokens = new TokenList();
  }

  /**
   * Inform the Tokenizer whether extra tokens for whitespace characters should be added to the token list or not.
   * @param flagIncludeWhitespace True -> Whitespace token will be added to token list; False otherwise.
   *                              Must not be {@code null} (the value is unboxed).
   * @return this
   */
  public Tokenizer setFlagWhiteSpace(final Boolean flagIncludeWhitespace) {
    this.flagIncludeWhitespace = flagIncludeWhitespace;
    return this;
  }

  /**
   * Tokenizes an expression as defined per OData specification.
   * <p>
   * Each call starts from the beginning of the expression with a fresh token list, so calling
   * this method repeatedly yields the same result instead of accumulating duplicate tokens.
   * @return Token list
   * @throws TokenizerException if a character cannot be assigned to any token, or if type
   *                            detection fails on a quoted literal
   * @throws ExpressionParserException if a quoted literal is not terminated
   */
  public TokenList tokenize() throws TokenizerException, ExpressionParserException {
    curPosition = 0;
    tokens = new TokenList();

    while (curPosition < expressionLength) {
      final int oldPosition = curPosition;
      final char curCharacter = expression.charAt(curPosition);

      switch (curCharacter) {
      case ' ':
        //count whitespace and move pointer to next non-whitespace char
        eatWhiteSpaces(curPosition, curCharacter);
        break;

      case '(':
        tokens.appendToken(curPosition, TokenKind.OPENPAREN, curCharacter);
        curPosition = curPosition + 1;
        break;

      case ')':
        tokens.appendToken(curPosition, TokenKind.CLOSEPAREN, curCharacter);
        curPosition = curPosition + 1;
        break;

      case '\'':
        readLiteral(curCharacter);
        break;

      case ',':
        tokens.appendToken(oldPosition, TokenKind.COMMA, curCharacter);
        curPosition = curPosition + 1;
        break;

      case '=':
      case '/':
      case '?':
      case '.':
      case '*':
        curPosition = curPosition + 1;
        tokens.appendToken(oldPosition, TokenKind.SYMBOL, curCharacter);
        break;

      default:
        final String rem_expr = expression.substring(curPosition); //remaining expression

        // The order matters: the first check that recognizes a token consumes it and the
        // short-circuit evaluation skips all later checks.
        if (checkForBinary(oldPosition, rem_expr)
            || checkForPrefix(rem_expr) //prefixes like X, binary, guid, datetime
            || checkForMath(oldPosition, rem_expr)
            || checkForMethod(oldPosition, rem_expr)
            || checkForBoolean(oldPosition, rem_expr)
            || checkForLiteral(oldPosition, curCharacter, rem_expr)) {
          break;
        }

        throw TokenizerException.createUNKNOWN_CHARACTER(oldPosition, Character.toString(curCharacter), expression);
      }
    }
    return tokens;
  }

  /**
   * Consumes a run of characters matched by {@link #OTHER_LIT} at the start of the remaining expression.
   * <p>
   * If type detection succeeds the run becomes a {@code SIMPLE_TYPE} token; otherwise it becomes an
   * untyped {@code LITERAL} token, except that a run starting with '-' only yields a one-character
   * {@code SYMBOL} token for the '-'.
   * @param oldPosition position of the first character of the run in the expression
   * @param curCharacter character at {@code oldPosition}
   * @param rem_expr remaining expression starting at {@code oldPosition}
   * @return true if a token was appended; false if the remaining expression does not start with such a run
   */
  private boolean checkForLiteral(final int oldPosition, final char curCharacter, final String rem_expr) {
    final Matcher matcher = OTHER_LIT.matcher(rem_expr);
    if (!matcher.lookingAt()) {
      return false;
    }

    final String token = matcher.group();
    try {
      final EdmLiteral edmLiteral = typeDetector.parseUriLiteral(token);
      curPosition = curPosition + token.length();
      // It is a simple type.
      tokens.appendEdmTypedToken(oldPosition, TokenKind.SIMPLE_TYPE, token, edmLiteral);
    } catch (EdmLiteralException e) {
      // Not an error: failed type detection means we treat it as normal untyped literal.

      // The '-' is checked here (and not in the switch statement) because it may be
      // part of a negative number.
      if (curCharacter == '-') {
        curPosition = curPosition + 1;
        tokens.appendToken(oldPosition, TokenKind.SYMBOL, curCharacter);
      } else {
        curPosition = curPosition + token.length();
        tokens.appendToken(oldPosition, TokenKind.LITERAL, token);
      }
    }
    return true;
  }

  /**
   * Consumes the remaining expression as a Boolean literal.
   * <p>
   * This only matches when the whole remaining expression is exactly {@code true} or {@code false};
   * any other input falls through to the checks that follow in {@link #tokenize()}.
   * @param oldPosition position of the first character of the remaining expression
   * @param rem_expr remaining expression starting at {@code oldPosition}
   * @return true if a Boolean token was appended
   */
  private boolean checkForBoolean(final int oldPosition, final String rem_expr) {
    if (!rem_expr.equals("true") && !rem_expr.equals("false")) {
      return false;
    }
    curPosition = curPosition + rem_expr.length();
    tokens.appendEdmTypedToken(oldPosition, TokenKind.SIMPLE_TYPE, rem_expr, new EdmLiteral(EdmSimpleTypeFacadeImpl.getEdmSimpleType(EdmSimpleTypeKind.Boolean), rem_expr));
    return true;
  }

  /**
   * Moves the cursor behind a run of blanks and, if whitespace tokens are requested, appends one
   * {@code WHITESPACE} token covering the whole run.
   * @param oldPosition position of the first blank
   * @param curCharacter character at the current cursor position
   */
  private void eatWhiteSpaces(final int oldPosition, char curCharacter) {
    while ((curCharacter == ' ') && (curPosition < expressionLength)) {
      curPosition = curPosition + 1;
      if (curPosition < expressionLength) {
        curCharacter = expression.charAt(curPosition);
      }
    }

    if (flagIncludeWhitespace) {
      final String whitespace = expression.substring(oldPosition, curPosition);
      tokens.appendEdmTypedToken(oldPosition, TokenKind.WHITESPACE, whitespace, null);
    }
  }

  /**
   * Consumes a method name that is followed by an opening parenthesis (see {@link #FUNCTION}).
   * Only the name is consumed; blanks and the parenthesis are left for the main loop.
   * @param oldPosition position of the first character of the remaining expression
   * @param rem_expr remaining expression starting at {@code oldPosition}
   * @return true if a token was appended
   */
  private boolean checkForMethod(final int oldPosition, final String rem_expr) {
    return appendKeywordIfPresent(FUNCTION, oldPosition, rem_expr);
  }

  /**
   * Consumes an arithmetic operator or {@code not} (see {@link #ARITHMETIC_OR_NOT}).
   * The blank required after the operator is not consumed.
   * @param oldPosition position of the first character of the remaining expression
   * @param rem_expr remaining expression starting at {@code oldPosition}
   * @return true if a token was appended
   */
  private boolean checkForMath(final int oldPosition, final String rem_expr) {
    return appendKeywordIfPresent(ARITHMETIC_OR_NOT, oldPosition, rem_expr);
  }

  /**
   * Consumes a logical or comparison operator (see {@link #LOGICAL_OR_COMPARISON}).
   * The blank required after the operator is not consumed.
   * @param oldPosition position of the first character of the remaining expression
   * @param rem_expr remaining expression starting at {@code oldPosition}
   * @return true if a token was appended
   */
  private boolean checkForBinary(final int oldPosition, final String rem_expr) {
    return appendKeywordIfPresent(LOGICAL_OR_COMPARISON, oldPosition, rem_expr);
  }

  /**
   * Appends group 1 of the pattern as {@code LITERAL} token and advances the cursor by its length,
   * provided the pattern matches at the start of the remaining expression.
   * @param keywordPattern start-anchored pattern whose group 1 is the keyword
   * @param oldPosition position of the first character of the remaining expression
   * @param rem_expr remaining expression starting at {@code oldPosition}
   * @return true if the pattern matched and a token was appended
   */
  private boolean appendKeywordIfPresent(final Pattern keywordPattern, final int oldPosition, final String rem_expr) {
    final Matcher matcher = keywordPattern.matcher(rem_expr);
    if (!matcher.lookingAt()) {
      return false;
    }
    final String token = matcher.group(1);
    curPosition = curPosition + token.length();
    tokens.appendToken(oldPosition, TokenKind.LITERAL, token);
    return true;
  }

  /**
   * Consumes a prefixed quoted literal such as {@code guid'...'} (see {@link #LITERAL_PREFIX}).
   * @param rem_expr remaining expression starting at the current cursor position
   * @return true if a prefixed literal was read and appended
   * @throws ExpressionParserException if the quoted part is not terminated
   * @throws TokenizerException if type detection fails on the literal
   */
  private boolean checkForPrefix(final String rem_expr) throws ExpressionParserException, TokenizerException {
    final Matcher matcher = LITERAL_PREFIX.matcher(rem_expr);
    if (!matcher.lookingAt()) {
      return false;
    }
    final String token = matcher.group(1);
    curPosition = curPosition + token.length();
    // The pattern guarantees that the character following the prefix is the opening '.
    readLiteral(expression.charAt(curPosition), token);
    return true;
  }

  /**
   * Reads a quoted literal without type prefix.
   * @param curCharacter opening apostrophe at the current cursor position
   * @throws ExpressionParserException if the literal is not terminated
   * @throws TokenizerException if type detection fails on the literal
   */
  private void readLiteral(final char curCharacter) throws ExpressionParserException, TokenizerException {
    readLiteral(curCharacter, "");
  }

  /**
   * Read up to single ' and move pointer to the following char and tries a type detection.
   * <p>
   * Of a doubled apostrophe inside the literal only the first one is copied into the token text;
   * the second one is skipped and does not terminate the literal.
   * @param curCharacter opening apostrophe at the current cursor position
   * @param token type prefix already consumed in front of the apostrophe (may be empty)
   * @throws ExpressionParserException if the end of the expression is reached without a closing apostrophe
   * @throws TokenizerException if type detection fails on the literal
   */
  private void readLiteral(char curCharacter, String token) throws ExpressionParserException, TokenizerException {
    // The token starts where the prefix started, i.e. prefix-length characters before the apostrophe.
    final int offsetPos = -token.length();
    final int oldPosition = curPosition;
    final StringBuilder literal = new StringBuilder(token).append(curCharacter);
    curPosition = curPosition + 1;

    boolean wasApostroph = false; //leading ' does not count
    while (curPosition < expressionLength) {
      curCharacter = expression.charAt(curPosition);

      if (curCharacter != '\'') {
        if (wasApostroph) {
          // The previous ' was not doubled, so it closed the literal.
          break;
        }
        literal.append(curCharacter);
      } else if (wasApostroph) {
        wasApostroph = false; //a double ' is a normal character '
      } else {
        wasApostroph = true;
        literal.append(curCharacter);
      }
      curPosition = curPosition + 1;
    }

    if (!wasApostroph) {
      //Exception tested within TestPMparseFilterString
      throw FilterParserExceptionImpl.createTOKEN_UNDETERMINATED_STRING(oldPosition, expression);
    }

    final String literalText = literal.toString();
    try {
      final EdmLiteral edmLiteral = typeDetector.parseUriLiteral(literalText);
      tokens.appendEdmTypedToken(oldPosition + offsetPos, TokenKind.SIMPLE_TYPE, literalText, edmLiteral);
    } catch (EdmLiteralException ex) {
      // NOTE(review): the reported position is that of the opening apostrophe, not of the prefix;
      // kept as is because callers may rely on it - confirm whether offsetPos should be applied.
      throw TokenizerException.createTYPEDECTECTION_FAILED_ON_STRING(ex, oldPosition, literalText);
    }
  }
}