/******************************************************************************* * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * This program and the accompanying materials are made available under the * terms of the Eclipse Public License v1.0 and Eclipse Distribution License v. 1.0 * which accompanies this distribution. * The Eclipse Public License is available at http://www.eclipse.org/legal/epl-v10.html * and the Eclipse Distribution License is available at * http://www.eclipse.org/org/documents/edl-v10.php. * * Contributors: * Oracle - initial API and implementation * ******************************************************************************/ package org.eclipse.persistence.jpa.jpql; /** * This "parser/scanner" holds onto the string version of the JPQL query that is parsed into a * parsed tree. It uses a cursor that lets the current {@link * org.eclipse.persistence.jpa.jpql.parser.Expression Expression} object to parse its fragment of the query. * <p> * Provisional API: This interface is part of an interim API that is still under development and * expected to change significantly before reaching stability. It is available at this early stage * to solicit feedback from pioneering adopters on the understanding that any code that uses this * API will almost certainly be broken (repeatedly) as the API evolves. * * @version 2.5.1 * @since 2.3 * @author Pascal Filion */ public final class WordParser { /** * The current position of the cursor within the JPQL query. */ private int cursor; /** * The length of the JPQL query. */ private final int length; /** * The string representation of the JPQL query. */ private final CharSequence text; /** * When {@link WordParser#word()}, {@link WordParser#wordEndPosition()}, or {@link * WordParser#wordEndPosition(int)} is called, this is updated to reflect the type of word that * is scanned. * * @since 2.4 */ private WordType wordType; /** * Creates a new <code>WordParser</code>. * * @param text The string representation of the JPQL query */ public WordParser(CharSequence text) { super(); this.cursor = 0; this.text = text; this.length = text.length(); this.wordType = WordType.UNDEFINED; } /** * Retrieves the character at the current cursor position. * * @return The character retrieved from the string at the current cursor position or '\0' if the * position is beyond the end of the text */ public char character() { return character(cursor); } /** * Retrieves the character at the given cursor position. * * @param position The position of the character to return * @return The character retrieved from the string at the given position or '\0' if the position * is beyond the end of the text */ public char character(int position) { return (position >= length) ? '\0' : text.charAt(position); } /** * Determines whether the query ends with the given suffix and the end position is the end of the * range for testing. * * @param endPosition The position where the check stops * @param suffix The suffix is the text that is used to match it with the substring within the text * @return <code>true</code> if the character sequence represented by the argument is a suffix of * the query; <code>false</code> otherwise */ public boolean endsWith(int endPosition, String suffix) { return startsWith(suffix, endPosition - suffix.length()); } /** * Determines whether the query ends with the given suffix and the end position is the end of the * range for testing. The case of the character is ignored. * * @param endPosition The position where the check stops * @param suffix The suffix is the text that is used to match it with the substring within the text * @return <code>true</code> if the character sequence represented by the argument is a suffix of * the query; <code>false</code> otherwise * @since 2.5 */ public boolean endsWithIgnoreCase(int endPosition, String suffix) { return startsWithIgnoreCase(suffix, endPosition - suffix.length()); } /** * Retrieves a word starting at the current position. The text before and after the position will * be part of the returned value. * <p> * For instance, "<b>SELECT</b> <b>AVG</b>(e.age) <b>FROM</b> Employee e": * <ul> * <li>Position 3, result is "SELECT"; * <li>Position 6, result is "SELECT"; * <li>Position 7, result is an empty string. * <li>Position 11, result is an empty string. * <li>Position 13, result is "e.". * </ul> * * @return The word in which the cursor is */ public String entireWord() { return entireWord(cursor); } /** * Retrieves a word starting at the given position. The text before and after the position will * be part of the returned value. * <p> * For instance, "<b>SELECT</b> <b>AVG</b>(e.age) <b>FROM</b> Employee e": * <ul> * <li>Position 3, result is "SELECT"; * <li>Position 6, result is "SELECT"; * <li>Position 7, result is an empty string. * <li>Position 11, result is an empty string. * <li>Position 13, result is "e.". * </ul> * * @param position The position where to retrieve the word * @return The word in which the cursor is */ public String entireWord(int position) { int startPosition = partialWordStartPosition(position); int endPosition = wordEndPosition(position); return substring(startPosition, endPosition); } /** * Returns what the type of word {@link #word()} returns. * * @return The category of the word returned by {@link #word()} * @since 2.4 */ public WordType getWordType() { return wordType; } /** * Determines whether the given character is an arithmetic symbol, which is one of the following: * { {@literal '>', '<', '/', '*', '-', '+', '=', '{'} }. * * @param character The character to test if it's a math symbol * @return <code>true</code> if the given character is one of the valid math symbols; * <code>false</code> otherwise */ public boolean isArithmeticSymbol(char character) { return character == '>' || character == '!' || character == '<' || character == '/' || character == '*' || character == '-' || character == '+' || character == '=' || character == '{'; } /** * Determines whether the given character is a delimiter. The delimiter are '(', ')' and ','. * * @param character The character to test * @return <code>true</code> if the given character is a delimiter; <code>false</code> otherwise */ public boolean isDelimiter(char character) { return character == '(' || character == ')' || character == ','; } /** * Determines whether the given character is a character that can be used in a number. This only * includes the numeric characters [0, 9] and the period character. This method should only be * used to determine if a word starts with a digit character. * * @param character The character to test if it's a digit * @return <code>true</code> if the given character is a digit; <code>false</code> otherwise */ public boolean isDigit(char character) { return (character == '.') || Character.isDigit(character); } /** * Determines whether the position of the cursor is at the end of the text. * * @return <code>true</code> if the position of the cursor is at the end of the text; * <code>false</code> otherwise */ public boolean isTail() { return cursor >= length; } /** * Determines whether the given character is not considered to be part of a word (which is * usually comprise of alphanumeric characters). * * @param character The character used to determine if it should be part of a word or not * @return <code>true</code> if the character can be part of a word; <code>false</code> if it is * not an alphanumeric character, which usually means is a whitespace, a delimiter or an * arithmetic symbol * @see Character#isWhitespace(char) * @see #isArithmeticSymbol(char) * @see #isDelimiter(char) */ public boolean isWordSeparator(char character) { return Character.isWhitespace(character) || isDelimiter(character) || isArithmeticSymbol(character); } /** * Returns the length of the string value. * * @return The total count of characters */ public int length() { return length; } /** * Moves the position of the cursor by the length of the given word. * * @param word The word used to determine how much to move the position forward */ public void moveBackward(CharSequence word) { cursor -= word.length(); } /** * Moves backward the position of the cursor by the given amount. * * @param position The amount to remove from the current position */ public void moveBackward(int position) { cursor -= position; } /** * Moves the position of the cursor by the length of the given word. * * @param word The word used to determine how much to move the position forward * @return The actual portion of the text that was skipped */ public String moveForward(CharSequence word) { return moveForward(word.length()); } /** * Moves forward the position of the cursor by the given amount. * * @param position The amount to add to the current position * @return The actual portion of the text that was skipped */ public String moveForward(int position) { String word = substring(cursor, cursor + position); cursor += position; return word; } /** * Moves the position of the cursor by the length of the given word and ignore any different in * whitespace count. If the text has more than one whitespace and the given word usually has one, * then only one will be part of the returned substring. * * @param word The word used to determine how much to move the position forward * @return The actual portion of the text that was skipped * @since 2.4.3 */ public String moveForwardIgnoreWhitespace(CharSequence word) { StringBuilder sb = new StringBuilder(); int pc = word.length(); int po = 0; while (--pc >= 0) { char c1 = text.charAt(cursor++); char c2 = word.charAt(po++); // Handle testing // 1. "... GROUP BY" and "GROUP BY" // 2. "... GROUP\nBY" and "GROUP BY" if (Character.isWhitespace(c1)) { if (Character.isWhitespace(c2)) { sb.append(' '); continue; } pc++; po--; continue; } sb.append(c1); } return sb.toString(); } /** * Retrieves the numeric literal that should be the current word to parse. * * @return The numeric literal value */ public String numericLiteral() { return substring(cursor, scanNumericLiteral(cursor)); } /** * Retrieves a word before the current position of the cursor, which determines when the parsing * stop. * <p> * For instance, "<b>SELECT</b> <b>AVG</b>(e.age) <b>FROM</b> Employee e": * <ul> * <li>Position 3, result is "SEL"; * <li>Position 6, result is "SELECT"; * <li>Position 7, result is an empty string. * <li>Position 11, result is an empty string. * <li>Position 13, result is "e.". * </ul> * * @return The sub-string that is before the position */ public String partialWord() { int startIndex = partialWordStartPosition(cursor); return substring(startIndex, cursor); } /** * Retrieves a word before the specified position, which determines when the parsing stop. * <p> * For instance, "<b>SELECT</b> <b>AVG</b>(e.age) <b>FROM</b> Employee e": * <ul> * <li>Position 3, result is "SEL"; * <li>Position 6, result is "SELECT"; * <li>Position 7, result is an empty string. * <li>Position 11, result is an empty string. * <li>Position 13, result is "e.". * </ul> * * @param position The position of the cursor * @return The sub-string that is before the position */ public String partialWord(int position) { int startIndex = partialWordStartPosition(position); return substring(startIndex, position); } /** * Finds the beginning of the word and the given position is within that word. * <p> * For instance, "<b>SELECT</b> <b>AVG</b>(e.age) <b>FROM</b> Employee e": * <ul> * <li>Position 3, result is 0; * <li>Position 8, result is 7; * </ul> * * @param position The position from which the search ends * @return The position, which is a smaller number or equal, than the given position */ public int partialWordStartPosition(int position) { int startIndex = position; for (int index = position; --index >= 0; ) { char character = text.charAt(index); if (Character.isWhitespace(character) || isDelimiter(character) || isArithmeticSymbol(character)) { break; } startIndex--; } return startIndex; } /** * Returns the current position of the cursor. * * @return The current position of the cursor */ public int position() { return cursor; } private int scanNumericLiteral(int startPosition) { int endIndex = startPosition; boolean digitParsed = false; boolean dotParsed = false; for (; endIndex < length; endIndex++) { char character = text.charAt(endIndex); // Usual digit if (character >= '0' && character <= '9') { digitParsed = true; } // The arithmetic sign before the number else if ((character == '+' || character == '-')) { // '+' or '-' is only valid at the beginning of the literal if (endIndex > startPosition) { break; } } // The separator of integer and decimal values else if (character == '.') { // A '.' was already parsed, it is not a valid numeric literal if (dotParsed) { endIndex = startPosition + 1; wordType = WordType.WORD; break; } dotParsed = true; } // Hexadecimal value else if (character == 'x') { boolean powerParsed = false; for (; endIndex < length; endIndex++) { character = text.charAt(endIndex); if (character == 'p' || character == 'P') { powerParsed = true; } else if (powerParsed && (character == '+' || character == '-')) { continue; } else if (isWordSeparator(character)) { break; } } break; } // Parse the exponent else if (character == 'e' || character == 'E') { if (!digitParsed) { wordType = WordType.WORD; break; } for (int index = ++endIndex; index < length; index++) { character = text.charAt(index); // The first character can be '+', '-' or a number if ((index == endIndex) && (character == '-' || character == '+') || character >= '0' && character <= '9') { endIndex++; continue; } if (character == '.') { wordType = WordType.WORD; } // If it is not a character like '(', then it's not a valid number if (isWordSeparator(character)) { break; } endIndex++; } break; } // A float/double or long number else if (character == 'f' || character == 'F' || character == 'd' || character == 'D' || character == 'l' || character == 'L') { // A single arithmetic symbol // Example: "-LENGTH..." -> "-" if (!digitParsed) { wordType = WordType.WORD; break; } endIndex++; // End of the text if (endIndex == length) { break; } character = text.charAt(endIndex); // Done parsing the numeric literal if (isWordSeparator(character)) { break; } } // Example: "-AVG..." -> "-" else if (!digitParsed && Character.isJavaIdentifierPart(character)) { wordType = WordType.WORD; break; } // Done parsing the numeric literal else if (isWordSeparator(character)) { break; } } return endIndex; } /** * Retrieves the first word from the given text starting at the specified position. * * @param startPosition the current position that will be used to parse the literal * @return The first word contained in the text, if none could be found, then an empty string is * returned */ private int scanStringLiteral(int startPosition) { int endIndex = startPosition; char startQuote = text.charAt(endIndex); for (endIndex++; endIndex < length; endIndex++) { char character = text.charAt(endIndex); if (character == startQuote) { endIndex++; // Verify the single quote is escaped with another single quote if ((startQuote == '\'') && (endIndex < length)) { char nextCharacter = text.charAt(endIndex); // The single quote is escaped, continue if (nextCharacter == '\'') { continue; } } // Verify the double quote is escaped with backslash else if ((startQuote == '\"') && (endIndex - 2 > startPosition)) { char previousCharacter = text.charAt(endIndex - 2); // The double quote is escaped, continue if (previousCharacter == '\\') { continue; } } // Reached the end of the string literal break; } } return endIndex; } /** * Manually sets the position of the cursor within the string. If the position is a negative * number, the position will be 0. * * @param position The new position of the cursor */ public void setPosition(int position) { this.cursor = (position < 0) ? 0 : position; } /** * Removes the whitespace that starts the given text. * * @return The number of whitespace removed */ public int skipLeadingWhitespace() { int count = 0; while (cursor < length) { char character = text.charAt(cursor); if (!Character.isWhitespace(character)) { break; } cursor++; count++; } return count; } /** * Determines whether the text starts with the given character. The case of the character is not * ignored. * * @param possibleCharacter The possible character at the current position * @return <code>true</code> if the text starts with the given character at the current position; * <code>false</code> otherwise */ public boolean startsWith(char possibleCharacter) { return possibleCharacter == character(); } /** * Tests whether the query starts with the specified prefix from the current position. * * @param prefix The prefix * @return <code>true</code> if the character sequence represented by the argument is a prefix of * the text; <code>false</code> otherwise */ public boolean startsWith(CharSequence prefix) { return startsWith(prefix, cursor); } /** * Tests whether the substring of the query beginning at the specified index starts with the * specified prefix. * * @param prefix The prefix * @param startIndex Where to begin looking in the query * @return <code>true</code> if the character sequence represented by the * argument is a prefix of the substring of this object starting at index <code>startIndex</code>; * <code>false</code> otherwise */ public boolean startsWith(CharSequence prefix, int startIndex) { int prefixLength = prefix.length(); // Note: startIndex might be near -1 >>> 1 if ((startIndex < 0) || (startIndex > length - prefixLength)) { return false; } int prefixIndex = 0; while (--prefixLength >= 0) { if (text.charAt(startIndex++) != prefix.charAt(prefixIndex++)) { return false; } } return true; } /** * Determines whether the character at the current position is one of the arithmetic operators: * { '+', '-', '*', '/' }, * * @return <code>true</code> if the character at the current position is an arithmetic operator; * <code>false</code> otherwise */ public boolean startsWithArithmeticOperator() { char character = text.charAt(cursor); return (character == '+') || (character == '/') || (character == '-') || (character == '*'); } /** * Determines if the text starts with a digit (<code>true</code>), an arithmetic term * (<code>false</code>) or anything else (<code>null</code>). * * @return <code>true</code> if the text starts with a digit (we'll assume it is a digit if the * text starts with a digit or an arithmetic sign followed by a digit), <code>false</code> if it * starts with an arithmetic term (we'll assume it is a digit followed by a non-digit character); * otherwise returns <code>null</code> */ public Boolean startsWithDigit() { char character = character(); // Check if the first character is either '+' or '-' and make sure it's not used for a numeric // value, which in that case, a numeric value will be created if (character == '-' || character == '+') { moveForward(1); int count = skipLeadingWhitespace(); character = character(cursor); moveBackward(count + 1); if (isDigit(character)) { return Boolean.TRUE; } if (character == '-' || character == '+' || character == '*' || character == '/') { return null; } return Boolean.FALSE; } if (character == '.') { return isDigit(character(cursor + 1)); } if (isDigit(character)) { return Boolean.TRUE; } return null; } /** * Determines whether the text at the current position start with the following identifier. * * @param identifier The JPQL identifier to match with the text at the current position * @return <code>true</code> if the text starts with the given text (case is ignored) and the * cursor is at the end of the text or is following by a word separator character; <code>false</code> * otherwise */ public boolean startsWithIdentifier(CharSequence identifier) { return startsWithIdentifier(identifier, cursor); } /** * Determines whether the text at the current position start with the following identifier. * * @param identifier The JPQL identifier to match with the text at the current position * @param position The position to start matching the characters * @return <code>true</code> if the text starts with the given text (case is ignored) and the * cursor is at the end of the text or is following by a word separator character; <code>false</code> * otherwise */ public boolean startsWithIdentifier(CharSequence identifier, int position) { int pc = identifier.length(); // Note: offset might be near -1 >>> 1 if ((position < 0) || (position > length - pc)) { return false; } int po = 0; int to = position; while (--pc >= 0) { char c1 = text.charAt(to++); char c2 = identifier.charAt(po++); if (c1 == c2) { continue; } // Handle testing // 1. "... GROUP BY" and "GROUP BY" // 2. "... GROUP\nBY" and "GROUP BY" if (Character.isWhitespace(c1)) { if (Character.isWhitespace(c2)) { continue; } if (to == length) { return false; } pc++; po--; continue; } // If characters don't match but case may be ignored, try converting // both characters to uppercase. If the results match, then the // comparison scan should continue char u1 = Character.toUpperCase(c1); char u2 = Character.toUpperCase(c2); if (u1 != u2) { return false; } // Unfortunately, conversion to uppercase does not work properly for // the Georgian alphabet, which has strange rules about case // conversion. So we need to make one last check before exiting if (Character.toLowerCase(u1) != Character.toLowerCase(u2)) { return false; } } // End of the text if (to == length) { return true; } // Check to see if the next character is a word separator char character = text.charAt(to); return isWordSeparator(character); } /** * Determines whether the text starts with the given character. The case of the character is ignored. * * @param possibleCharacter The possible character at the current position * @return <code>true</code> if the text starts with the given character at the current position; * <code>false</code> otherwise */ public boolean startsWithIgnoreCase(char possibleCharacter) { char character = character(); return possibleCharacter == character || possibleCharacter == Character.toUpperCase(character); } /** * Tests if the string starts with the specified prefix. The case of the character is ignored. * * @param prefix The prefix to test against * @return <code>true</code> if the character sequence represented by the argument is a prefix of * the character sequence represented by this string; <code>false</code> otherwise. Note also * that <code>true</code> will be returned if the argument is an empty string or is equal to this * <code>String</code> object as determined by the {@link #equals(Object)} method */ public boolean startsWithIgnoreCase(CharSequence prefix) { return startsWithIgnoreCase(prefix, cursor); } /** * Tests if the string starts with the specified prefix. The case of the character is ignored. * * @param prefix The prefix to test against * @param offset Where to begin looking in this string * @return <code>true</code> if the character sequence represented by the argument is a prefix of * the character sequence represented by this string; <code>false</code> otherwise */ public boolean startsWithIgnoreCase(CharSequence prefix, int offset) { int pc = prefix.length(); // Note: offset might be near -1 >>> 1 if ((offset < 0) || (offset > length - pc)) { return false; } int po = 0; int to = offset; while (--pc >= 0) { char c1 = text.charAt(to++); char c2 = prefix.charAt(po++); if (c1 == c2) { continue; } // If characters don't match but case may be ignored, try converting // both characters to uppercase. If the results match, then the // comparison scan should continue char u1 = Character.toUpperCase(c1); char u2 = Character.toUpperCase(c2); if (u1 != u2) { return false; } // Unfortunately, conversion to uppercase does not work properly for // the Georgian alphabet, which has strange rules about case // conversion. So we need to make one last check before exiting if (Character.toLowerCase(u1) != Character.toLowerCase(u2)) { return false; } } return true; } /** * Returns a substring that is within the current position of the cursor and the end of the text. * * @return The remain of the string starting at the current position */ public String substring() { return substring(cursor); } /** * Returns a substring that is within the given position and the end of the text. * * @param startIndex The beginning of the substring, inclusive * @return The remain of the string starting at the given position */ public String substring(int startIndex) { return substring(startIndex, length); } /** * Returns a substring that is within the given positions. * * @param startIndex The beginning of the substring, inclusive * @param endIndex The end of the substring, exclusive * @return The remain of the string that is within the given positions */ public String substring(int startIndex, int endIndex) { return text.subSequence(startIndex, endIndex).toString(); } /** * {@inheritDoc} */ @Override public String toString() { return isTail() ? ExpressionTools.EMPTY_STRING : substring(); } /** * Calculates the number of whitespace that are in the query. The check starts at the current position. * * @return The count of consecutive whitespace found from the current position */ public int whitespaceCount() { return whitespaceCount(cursor); } /** * Calculates the number of whitespace that are in the query. The check starts at the current position. * * @param position The position from where the scan starts * @return The count of consecutive whitespace found from the given position */ public int whitespaceCount(int position) { for (int index = position; index < length; index++) { char character = text.charAt(index); if (!Character.isWhitespace(character)) { return index - position; } } return 0; } /** * Retrieves the first word starting at the current position. * * @return The first word contained in the text, if none could be found, * then an empty string is returned */ public String word() { return substring(cursor, wordEndPosition()); } /** * Returns the position a word would end based on the current cursor position. {@link #getWordType()} * can be used to determine the type of word that was scanned. * * @return The position where the current word ends * @see #word() WordParser.word() * @see WordType */ public int wordEndPosition() { return wordEndPosition(cursor); } /** * Returns the position a word would end based on the given start position. {@link #getWordType()} * can be used to determine the type of word that was scanned. * * @param position The position to start scanning the text * @return The position where the current word ends * @see #word() WordParser.word() * @see WordType */ public int wordEndPosition(int position) { if (position >= length) { wordType = WordType.UNDEFINED; return position; } char character = text.charAt(position); int endIndex = position + 1; wordType = WordType.WORD; // Parse a string literal if (ExpressionTools.isQuote(character)) { wordType = WordType.STRING_LITERAL; // The quote is the end quote if (position > 1) { character = character(position - 1); if (ExpressionTools.isQuote(character)) { return endIndex; } } return scanStringLiteral(position); } // Parse an input parameter if (ExpressionTools.isParameter(character)) { wordType = WordType.INPUT_PARAMETER; for (; endIndex < length; endIndex++) { character = text.charAt(endIndex); // Special case for '!=' if ((character == '!') && (endIndex + 1 < length)) { character = text.charAt(endIndex + 1); if (character == '=') { break; } endIndex++; continue; } if (isWordSeparator(character)) { break; } } return endIndex; } // Parse an arithmetic symbol if (character == '/' || character == '*' || character == '+' || character == '-') { return endIndex; } // Parse JDBC date if (character == '{') { // TODO return endIndex; } // Parse a numeric literal if (isDigit(character)) { wordType = WordType.NUMERIC_LITERAL; endIndex = scanNumericLiteral(position); // The word is a valid numeric literal, stop now, // otherwise scan for the entire word if (wordType == WordType.NUMERIC_LITERAL) { return endIndex; } } // '=' else if (character == '=') { return endIndex; } // <, <>, <= else if (character == '<') { if (endIndex < length) { character = text.charAt(endIndex); if (character == '>' || character == '=') { return endIndex + 1; } return endIndex; } } // >, >=, != else if (character == '>' || character == '!') { // End of the text if (endIndex == length) { return endIndex; } // Scan the next character char nextCharacter = text.charAt(endIndex); if (nextCharacter == '=') { return ++endIndex; } if (character == '>') { return endIndex; } } // Done scanning else if (isWordSeparator(character)) { return --endIndex; } // Scan for an entire word for (int index = endIndex; index < length; index++) { character = text.charAt(index); // Special case for '!=' if ((character == '!') && (endIndex + 1 < length)) { character = text.charAt(index + 1); if (character == '=') { return endIndex; } endIndex++; continue; } if (Character.isWhitespace(character) || isDelimiter(character) || character == '>' || character == '<' || character == '/' || character == '*' || character == '-' || character == '+' || character == '=') { break; } // Continue to the next character endIndex++; } return endIndex; } /** * This enumeration determines the type of word that was scanned. It will be set by {@link * WordParser#word()}, {@link WordParser#wordEndPosition()}, {@link WordParser#wordEndPosition(int)}. */ public enum WordType { /** * The word being scanned is an input parameter, it starts with either ':' or '?'. */ INPUT_PARAMETER, /** * The word being scanned is a numeric literal (decimal or hexadecimal number). */ NUMERIC_LITERAL, /** * The word being scanned is a string literal, it starts with either ''' or '"'. */ STRING_LITERAL, /** * No word was scanned, this is usually set when the cursor is at the end of the text. */ UNDEFINED, /** * The word being scanned anything else other than an input parameter, numeric literal or * string literal. */ WORD } }