/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.reverse; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.Version; import java.io.IOException; /** * Reverse token string, for example "country" => "yrtnuoc". * <p> * If <code>marker</code> is supplied, then tokens will be also prepended by * that character. For example, with a marker of \u0001, "country" => * "\u0001yrtnuoc". This is useful when implementing efficient leading * wildcards search. * </p> * <a name="version"/> * <p>You must specify the required {@link Version} * compatibility when creating ReverseStringFilter, or when using any of * its static methods: * <ul> * <li> As of 3.1, supplementary characters are handled correctly * </ul> */ public final class ReverseStringFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final char marker; private final Version matchVersion; private static final char NOMARKER = '\uFFFF'; /** * Example marker character: U+0001 (START OF HEADING) */ public static final char START_OF_HEADING_MARKER = '\u0001'; /** * Example marker character: U+001F (INFORMATION SEPARATOR ONE) */ public static final char INFORMATION_SEPARATOR_MARKER = '\u001F'; /** * Example marker character: U+EC00 (PRIVATE USE AREA: EC00) */ public static final char PUA_EC00_MARKER = '\uEC00'; /** * Example marker character: U+200F (RIGHT-TO-LEFT MARK) */ public static final char RTL_DIRECTION_MARKER = '\u200F'; /** * Create a new ReverseStringFilter that reverses all tokens in the * supplied {@link TokenStream}. * <p> * The reversed tokens will not be marked. * </p> * * @param in {@link TokenStream} to filter * @deprecated use {@link #ReverseStringFilter(Version, TokenStream)} * instead. This constructor will be removed in Lucene 4.0 */ @Deprecated public ReverseStringFilter(TokenStream in) { this(in, NOMARKER); } /** * Create a new ReverseStringFilter that reverses and marks all tokens in the * supplied {@link TokenStream}. * <p> * The reversed tokens will be prepended (marked) by the <code>marker</code> * character. * </p> * * @param in {@link TokenStream} to filter * @param marker A character used to mark reversed tokens * @deprecated use {@link #ReverseStringFilter(Version, TokenStream, char)} * instead. This constructor will be removed in Lucene 4.0 */ @Deprecated public ReverseStringFilter(TokenStream in, char marker) { this(Version.LUCENE_30, in, marker); } /** * Create a new ReverseStringFilter that reverses all tokens in the * supplied {@link TokenStream}. * <p> * The reversed tokens will not be marked. * </p> * * @param matchVersion See <a href="#version">above</a> * @param in {@link TokenStream} to filter */ public ReverseStringFilter(Version matchVersion, TokenStream in) { this(matchVersion, in, NOMARKER); } /** * Create a new ReverseStringFilter that reverses and marks all tokens in the * supplied {@link TokenStream}. * <p> * The reversed tokens will be prepended (marked) by the <code>marker</code> * character. * </p> * * @param matchVersion See <a href="#version">above</a> * @param in {@link TokenStream} to filter * @param marker A character used to mark reversed tokens */ public ReverseStringFilter(Version matchVersion, TokenStream in, char marker) { super(in); this.matchVersion = matchVersion; this.marker = marker; } @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { int len = termAtt.length(); if (marker != NOMARKER) { len++; termAtt.resizeBuffer(len); termAtt.buffer()[len - 1] = marker; } reverse( matchVersion, termAtt.buffer(), 0, len ); termAtt.setLength(len); return true; } else { return false; } } /** * Reverses the given input string * * @param input the string to reverse * @return the given input string in reversed order * @deprecated use {@link #reverse(Version, String)} instead. This method * will be removed in Lucene 4.0 */ @Deprecated public static String reverse( final String input ){ return reverse(Version.LUCENE_30, input); } /** * Reverses the given input string * * @param matchVersion See <a href="#version">above</a> * @param input the string to reverse * @return the given input string in reversed order */ public static String reverse( Version matchVersion, final String input ){ final char[] charInput = input.toCharArray(); reverse( matchVersion, charInput, 0, charInput.length ); return new String( charInput ); } /** * Reverses the given input buffer in-place * @param buffer the input char array to reverse * @deprecated use {@link #reverse(Version, char[])} instead. This * method will be removed in Lucene 4.0 */ @Deprecated public static void reverse( final char[] buffer ){ reverse( buffer, 0, buffer.length ); } /** * Reverses the given input buffer in-place * @param matchVersion See <a href="#version">above</a> * @param buffer the input char array to reverse */ public static void reverse(Version matchVersion, final char[] buffer) { reverse(matchVersion, buffer, 0, buffer.length); } /** * Partially reverses the given input buffer in-place from offset 0 * up to the given length. * @param buffer the input char array to reverse * @param len the length in the buffer up to where the * buffer should be reversed * @deprecated use {@link #reverse(Version, char[], int)} instead. This * method will be removed in Lucene 4.0 */ @Deprecated public static void reverse( final char[] buffer, final int len ){ reverse( buffer, 0, len ); } /** * Partially reverses the given input buffer in-place from offset 0 * up to the given length. * @param matchVersion See <a href="#version">above</a> * @param buffer the input char array to reverse * @param len the length in the buffer up to where the * buffer should be reversed */ public static void reverse(Version matchVersion, final char[] buffer, final int len) { reverse( matchVersion, buffer, 0, len ); } /** * Partially reverses the given input buffer in-place from the given offset * up to the given length. * @param buffer the input char array to reverse * @param start the offset from where to reverse the buffer * @param len the length in the buffer up to where the * buffer should be reversed * @deprecated use {@link #reverse(Version, char[], int, int)} instead. This * method will be removed in Lucene 4.0 */ @Deprecated public static void reverse(char[] buffer, int start, int len ) { reverseUnicode3(buffer, start, len); } /** * @deprecated Remove this when support for 3.0 indexes is no longer needed. */ @Deprecated private static void reverseUnicode3( char[] buffer, int start, int len ){ if( len <= 1 ) return; int num = len>>1; for( int i = start; i < ( start + num ); i++ ){ char c = buffer[i]; buffer[i] = buffer[start * 2 + len - i - 1]; buffer[start * 2 + len - i - 1] = c; } } /** * Partially reverses the given input buffer in-place from the given offset * up to the given length. * @param matchVersion See <a href="#version">above</a> * @param buffer the input char array to reverse * @param start the offset from where to reverse the buffer * @param len the length in the buffer up to where the * buffer should be reversed */ public static void reverse(Version matchVersion, final char[] buffer, final int start, final int len) { if (!matchVersion.onOrAfter(Version.LUCENE_31)) { reverseUnicode3(buffer, start, len); return; } /* modified version of Apache Harmony AbstractStringBuilder reverse0() */ if (len < 2) return; int end = (start + len) - 1; char frontHigh = buffer[start]; char endLow = buffer[end]; boolean allowFrontSur = true, allowEndSur = true; final int mid = start + (len >> 1); for (int i = start; i < mid; ++i, --end) { final char frontLow = buffer[i + 1]; final char endHigh = buffer[end - 1]; final boolean surAtFront = allowFrontSur && Character.isSurrogatePair(frontHigh, frontLow); if (surAtFront && (len < 3)) { // nothing to do since surAtFront is allowed and 1 char left return; } final boolean surAtEnd = allowEndSur && Character.isSurrogatePair(endHigh, endLow); allowFrontSur = allowEndSur = true; if (surAtFront == surAtEnd) { if (surAtFront) { // both surrogates buffer[end] = frontLow; buffer[--end] = frontHigh; buffer[i] = endHigh; buffer[++i] = endLow; frontHigh = buffer[i + 1]; endLow = buffer[end - 1]; } else { // neither surrogates buffer[end] = frontHigh; buffer[i] = endLow; frontHigh = frontLow; endLow = endHigh; } } else { if (surAtFront) { // surrogate only at the front buffer[end] = frontLow; buffer[i] = endLow; endLow = endHigh; allowFrontSur = false; } else { // surrogate only at the end buffer[end] = frontHigh; buffer[i] = endHigh; frontHigh = frontLow; allowEndSur = false; } } } if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) { // only if odd length buffer[end] = allowFrontSur ? endLow : frontHigh; } } }