package org.apache.lucene.queryParser.standard.parser;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Locale;

import org.apache.lucene.messages.MessageImpl;
import org.apache.lucene.queryParser.core.messages.QueryParserMessages;
import org.apache.lucene.queryParser.core.parser.EscapeQuerySyntax;
import org.apache.lucene.queryParser.core.util.UnescapedCharSequence;

/**
 * {@link EscapeQuerySyntax} implementation that prefixes characters and
 * keywords listed in the tables below with a backslash, and that offers the
 * inverse operation through {@link #discardEscapeChar(CharSequence)}.
 */
public class EscapeQuerySyntaxImpl implements EscapeQuerySyntax {

  /** Wildcard characters handed to {@code UnescapedCharSequence.toStringEscaped}. */
  private static final char[] wildcardChars = { '*', '?' };

  /** Characters that are escaped only when they are the first character of a term. */
  private static final String[] escapableTermExtraFirstChars = { "+", "-", "@" };

  /** Characters that are escaped wherever they occur in a term. */
  private static final String[] escapableTermChars = { "\"", "<", ">", "=",
      "!", "(", ")", "^", "[", "{", ":", "]", "}", "~" };

  // TODO: check what to do with these "*", "?", "\\"
  /** Characters that are escaped inside a quoted string. */
  private static final String[] escapableQuotedChars = { "\"" };

  /** Whitespace characters that are escaped inside a term. */
  private static final String[] escapableWhiteChars = { " ", "\t", "\n", "\r",
      "\f", "\b", "\u3000" };

  /** Words that are escaped when a whole term equals one of them (ignoring case). */
  private static final String[] escapableWordTokens = { "AND", "OR", "NOT",
      "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" };

  /**
   * Escapes every occurrence of the {@link #escapableTermChars} in the given
   * string and, in addition, a leading character that is one of the
   * {@link #escapableTermExtraFirstChars}.
   * 
   * @param str
   *          string to escape; returned unchanged when null or empty
   * @param locale
   *          locale passed on to {@link #replaceIgnoreCase}
   * @return the escaped character sequence
   */
  private static final CharSequence escapeChar(CharSequence str, Locale locale) {
    if (str == null || str.length() == 0)
      return str;

    CharSequence buffer = str;

    // regular escapable Char for terms
    for (int i = 0; i < escapableTermChars.length; i++) {
      buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(),
          "\\", locale);
    }

    // First Character of a term as more escaping chars
    for (int i = 0; i < escapableTermExtraFirstChars.length; i++) {
      if (buffer.charAt(0) == escapableTermExtraFirstChars[i].charAt(0)) {
        buffer = "\\" + buffer.charAt(0)
            + buffer.subSequence(1, buffer.length());
        break;
      }
    }

    return buffer;
  }

  /**
   * Escapes every occurrence of the {@link #escapableQuotedChars} in the given
   * string.
   * 
   * @param str
   *          string to escape; returned unchanged when null or empty
   * @param locale
   *          locale passed on to {@link #replaceIgnoreCase}
   * @return the escaped character sequence
   */
  private static final CharSequence escapeQuoted(CharSequence str, Locale locale) {
    if (str == null || str.length() == 0)
      return str;

    CharSequence buffer = str;

    // Index the same array that bounds the loop. The previous code read
    // escapableTermChars[i] here, which only produced the right result
    // because both arrays happen to start with the double quote.
    for (int i = 0; i < escapableQuotedChars.length; i++) {
      buffer = replaceIgnoreCase(buffer, escapableQuotedChars[i].toLowerCase(),
          "\\", locale);
    }
    return buffer;
  }

  /**
   * Escapes a term: first the single characters (see {@link #escapeChar} and
   * {@link #escapeWhiteChar}), then, if the resulting term equals one of the
   * {@link #escapableWordTokens} ignoring case, the whole term is prefixed
   * with a backslash.
   * 
   * @param term
   *          term to escape; returned unchanged when null
   * @param locale
   *          locale passed on to the character escaping helpers
   * @return the escaped term
   */
  private static final CharSequence escapeTerm(CharSequence term, Locale locale) {
    if (term == null)
      return term;

    // Escape single Chars
    term = escapeChar(term, locale);
    term = escapeWhiteChar(term, locale);

    // Escape Parser Words
    for (int i = 0; i < escapableWordTokens.length; i++) {
      if (escapableWordTokens[i].equalsIgnoreCase(term.toString()))
        return "\\" + term;
    }
    return term;
  }

  /**
   * replace with ignore case
   * 
   * @param string
   *          string to get replaced
   * @param sequence1
   *          the old character sequence in lowercase
   * @param escapeChar
   *          the new character to prefix sequence1 in return string.
   * @param locale
   *          locale used to lower-case <code>string</code> before searching
   * @return the new String, or <code>string</code> itself when nothing was
   *         replaced
   * @throws NullPointerException
   *           if <code>string</code>, <code>sequence1</code> or
   *           <code>escapeChar</code> is null
   */
  private static CharSequence replaceIgnoreCase(CharSequence string,
      CharSequence sequence1, CharSequence escapeChar, Locale locale) {
    if (escapeChar == null || sequence1 == null || string == null)
      throw new NullPointerException();

    // empty string case
    int count = string.length();
    int sequence1Length = sequence1.length();
    if (sequence1Length == 0) {
      StringBuilder result = new StringBuilder((count + 1)
          * escapeChar.length());
      result.append(escapeChar);
      for (int i = 0; i < count; i++) {
        result.append(string.charAt(i));
        result.append(escapeChar);
      }
      return result.toString();
    }

    // Nothing to search in; return before touching the locale so an empty
    // input behaves exactly as it did when the lower-casing lived in the loop.
    if (count == 0)
      return string;

    // normal case
    // Materialize and lower-case the input once instead of on every loop
    // iteration and every character comparison.
    // NOTE(review): locale-sensitive lower-casing can change the length of
    // some strings; indices found in the lowered copy are applied to the
    // original. Confirm whether such input can reach this method.
    final String original = string.toString();
    final String lowered = original.toLowerCase(locale);

    StringBuilder result = new StringBuilder();
    char first = sequence1.charAt(0);
    int start = 0, copyStart = 0, firstIndex;
    while (start < count) {
      if ((firstIndex = lowered.indexOf(first, start)) == -1)
        break;
      boolean found = true;
      if (sequence1Length > 1) {
        if (firstIndex + sequence1Length > count)
          break;
        for (int i = 1; i < sequence1Length; i++) {
          if (lowered.charAt(firstIndex + i) != sequence1.charAt(i)) {
            found = false;
            break;
          }
        }
      }
      if (found) {
        // copy the untouched part, then the escape prefix, then the match in
        // its original case
        result.append(original.substring(copyStart, firstIndex));
        result.append(escapeChar);
        result.append(original.substring(firstIndex, firstIndex
            + sequence1Length));
        copyStart = start = firstIndex + sequence1Length;
      } else {
        start = firstIndex + 1;
      }
    }
    if (result.length() == 0 && copyStart == 0)
      return string;
    result.append(original.substring(copyStart));
    return result.toString();
  }

  /**
   * escape all whitespace characters listed in {@link #escapableWhiteChars}
   * on a given string
   * 
   * @param str
   *          string to get replaced; returned unchanged when null or empty
   * @param locale
   *          locale to be used when performing string compares
   * @return the new String
   */
  private static final CharSequence escapeWhiteChar(CharSequence str,
      Locale locale) {
    if (str == null || str.length() == 0)
      return str;

    CharSequence buffer = str;

    for (int i = 0; i < escapableWhiteChars.length; i++) {
      buffer = replaceIgnoreCase(buffer, escapableWhiteChars[i].toLowerCase(),
          "\\", locale);
    }
    return buffer;
  }

  /**
   * Escapes the given text. Wildcards and the escape character are escaped
   * first; after that the text is escaped as a quoted string when
   * <code>type</code> is {@link Type#STRING}, and as a term otherwise.
   * 
   * @param text
   *          text to escape; returned unchanged when null or empty
   * @param locale
   *          locale to be used when performing string compares
   * @param type
   *          selects quoted-string or term escaping
   * @return the escaped text
   */
  public CharSequence escape(CharSequence text, Locale locale, Type type) {
    if (text == null || text.length() == 0)
      return text;

    // escape wildcards and the escape char (this has to be perform before
    // anything else)
    // since we need to preserve the UnescapedCharSequence and escape the
    // original escape chars
    if (text instanceof UnescapedCharSequence) {
      text = ((UnescapedCharSequence) text).toStringEscaped(wildcardChars);
    } else {
      text = new UnescapedCharSequence(text).toStringEscaped(wildcardChars);
    }

    if (type == Type.STRING) {
      return escapeQuoted(text, locale);
    } else {
      return escapeTerm(text, locale);
    }
  }

  /**
   * Returns a String where the escape char has been removed, or kept only once
   * if there was a double escape.
   * 
   * Supports escaped unicode characters, e. g. translates <code>\\u0041</code>
   * to <code>A</code>.
   * 
   * @param input
   *          the escaped character sequence
   * @return the unescaped sequence, which also records which characters had
   *         been escaped
   * @throws ParseException
   *           if the input ends with a lone escape character, ends in the
   *           middle of an escaped unicode sequence, or contains a non-hex
   *           digit inside one
   */
  public static UnescapedCharSequence discardEscapeChar(CharSequence input)
      throws ParseException {
    // Create char array to hold unescaped char sequence
    char[] output = new char[input.length()];
    boolean[] wasEscaped = new boolean[input.length()];

    // The length of the output can be less than the input
    // due to discarded escape chars. This variable holds
    // the actual length of the output
    int length = 0;

    // We remember whether the last processed character was
    // an escape character
    boolean lastCharWasEscapeChar = false;

    // The multiplier the current unicode digit must be multiplied with.
    // E. g. the first digit must be multiplied with 16^3, the second with
    // 16^2...
    int codePointMultiplier = 0;

    // Used to calculate the codepoint of the escaped unicode character
    int codePoint = 0;

    for (int i = 0; i < input.length(); i++) {
      char curChar = input.charAt(i);
      if (codePointMultiplier > 0) {
        codePoint += hexToInt(curChar) * codePointMultiplier;
        codePointMultiplier >>>= 4;
        if (codePointMultiplier == 0) {
          output[length++] = (char) codePoint;
          codePoint = 0;
        }
      } else if (lastCharWasEscapeChar) {
        if (curChar == 'u') {
          // found an escaped unicode character
          codePointMultiplier = 16 * 16 * 16;
        } else {
          // this character was escaped
          output[length] = curChar;
          wasEscaped[length] = true;
          length++;
        }
        lastCharWasEscapeChar = false;
      } else {
        if (curChar == '\\') {
          lastCharWasEscapeChar = true;
        } else {
          output[length] = curChar;
          length++;
        }
      }
    }

    if (codePointMultiplier > 0) {
      throw new ParseException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION));
    }

    if (lastCharWasEscapeChar) {
      throw new ParseException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER));
    }

    return new UnescapedCharSequence(output, wasEscaped, 0, length);
  }

  /**
   * Returns the numeric value of the hexadecimal character
   * 
   * @param c
   *          a character in the ranges 0-9, a-f or A-F
   * @return the value of <code>c</code> as a hexadecimal digit (0 to 15)
   * @throws ParseException
   *           if <code>c</code> is not a hexadecimal digit
   */
  private static final int hexToInt(char c) throws ParseException {
    if ('0' <= c && c <= '9') {
      return c - '0';
    } else if ('a' <= c && c <= 'f') {
      return c - 'a' + 10;
    } else if ('A' <= c && c <= 'F') {
      return c - 'A' + 10;
    } else {
      throw new ParseException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c));
    }
  }

}