ISOLatin1AccentFilter.java example

Explorer
pylucene-master
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * A token filter that folds accented characters of the ISO Latin 1 character
 * set (ISO-8859-1) to their unaccented equivalents. The case of each character
 * is preserved.
 * <p>
 * For instance, 'à' will be replaced by 'a'. Some characters expand to two
 * letters (for example 'Æ' becomes "AE" and 'ß' becomes "ss"), and a few
 * characters outside Latin 1 (Ĳ, ĳ, Œ, œ, Ÿ and several U+FB0x ligatures) are
 * rewritten as well.
 *
 * @deprecated If you build a new index, use {@link ASCIIFoldingFilter}
 * which covers a superset of Latin 1.
 * This class is included for use with existing
 * indexes and will be removed in a future release (possibly Lucene 4.0).
 */
@Deprecated
public final class ISOLatin1AccentFilter extends TokenFilter {

  /** Lowest character value for which a replacement may exist. */
  private static final char FIRST_FOLDABLE = '\u00c0';

  /** Highest character value for which a replacement may exist. */
  private static final char LAST_FOLDABLE = '\uFB06';

  /**
   * Creates a filter that folds the terms produced by the given stream.
   *
   * @param input the token stream whose terms are rewritten
   */
  public ISOLatin1AccentFilter(TokenStream input) {
    super(input);
  }

  /** Scratch buffer receiving the folded term; replaced by a larger one on demand. */
  private char[] output = new char[256];

  /** Number of valid characters currently held in {@link #output}. */
  private int outputPos;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Advances the wrapped stream and, if the new term contains at least one
   * character inside the foldable range, replaces the term with its folded form.
   * Terms without any such character are passed through untouched.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public final boolean incrementToken() throws java.io.IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] termBuffer = termAtt.buffer();
    final int termLength = termAtt.length();

    // Skip ahead to the first character that could need rewriting; when there
    // is none the term is returned exactly as it arrived.
    int idx = 0;
    while (idx < termLength && !mayNeedFolding(termBuffer[idx])) {
      idx++;
    }

    if (idx < termLength) {
      removeAccents(termBuffer, termLength);
      termAtt.copyBuffer(output, 0, outputPos);
    }
    return true;
  }

  /**
   * Folds the first {@code length} characters of {@code input} into the
   * internal output buffer, replacing accented characters by their unaccented
   * equivalents. The number of characters written is recorded alongside the
   * buffer; the input array itself is not modified.
   *
   * @param input  characters to fold
   * @param length number of characters of {@code input} to process
   */
  public final void removeAccents(char[] input, int length) {
    // Every replacement is at most two characters long, so twice the input
    // length is always enough room.
    ensureOutputCapacity(2 * length);

    outputPos = 0;

    for (int i = 0; i < length; i++) {
      final char ch = input[i];
      if (mayNeedFolding(ch)) {
        fold(ch);
      } else {
        // Outside the foldable range: keep the character unchanged.
        emit(ch);
      }
    }
  }

  /** Tells whether {@code c} lies in the range that contains all foldable characters. */
  private static boolean mayNeedFolding(char c) {
    return c >= FIRST_FOLDABLE && c <= LAST_FOLDABLE;
  }

  /**
   * Makes sure the output buffer holds at least {@code required} characters,
   * doubling its size until it does. Existing content is not carried over.
   */
  private void ensureOutputCapacity(int required) {
    int capacity = output.length;
    while (capacity < required) {
      capacity *= 2;
    }
    if (capacity != output.length) {
      output = new char[capacity];
    }
  }

  /** Appends a single character to the output buffer. */
  private void emit(char c) {
    output[outputPos++] = c;
  }

  /** Appends a two-character replacement to the output buffer. */
  private void emit(char first, char second) {
    output[outputPos++] = first;
    output[outputPos++] = second;
  }

  /**
   * Appends the replacement for {@code c} to the output buffer, or {@code c}
   * itself when no replacement is defined for it.
   */
  private void fold(char c) {
    switch (c) {
      // ---- upper case ----
      case '\u00C0': // À
      case '\u00C1': // Á
      case '\u00C2': // Â
      case '\u00C3': // Ã
      case '\u00C4': // Ä
      case '\u00C5': // Å
        emit('A');
        break;
      case '\u00C6': // Æ
        emit('A', 'E');
        break;
      case '\u00C7': // Ç
        emit('C');
        break;
      case '\u00C8': // È
      case '\u00C9': // É
      case '\u00CA': // Ê
      case '\u00CB': // Ë
        emit('E');
        break;
      case '\u00CC': // Ì
      case '\u00CD': // Í
      case '\u00CE': // Î
      case '\u00CF': // Ï
        emit('I');
        break;
      case '\u0132': // Ĳ
        emit('I', 'J');
        break;
      case '\u00D0': // Ð
        emit('D');
        break;
      case '\u00D1': // Ñ
        emit('N');
        break;
      case '\u00D2': // Ò
      case '\u00D3': // Ó
      case '\u00D4': // Ô
      case '\u00D5': // Õ
      case '\u00D6': // Ö
      case '\u00D8': // Ø
        emit('O');
        break;
      case '\u0152': // Œ
        emit('O', 'E');
        break;
      case '\u00DE': // Þ
        emit('T', 'H');
        break;
      case '\u00D9': // Ù
      case '\u00DA': // Ú
      case '\u00DB': // Û
      case '\u00DC': // Ü
        emit('U');
        break;
      case '\u00DD': // Ý
      case '\u0178': // Ÿ
        emit('Y');
        break;

      // ---- lower case ----
      case '\u00E0': // à
      case '\u00E1': // á
      case '\u00E2': // â
      case '\u00E3': // ã
      case '\u00E4': // ä
      case '\u00E5': // å
        emit('a');
        break;
      case '\u00E6': // æ
        emit('a', 'e');
        break;
      case '\u00E7': // ç
        emit('c');
        break;
      case '\u00E8': // è
      case '\u00E9': // é
      case '\u00EA': // ê
      case '\u00EB': // ë
        emit('e');
        break;
      case '\u00EC': // ì
      case '\u00ED': // í
      case '\u00EE': // î
      case '\u00EF': // ï
        emit('i');
        break;
      case '\u0133': // ĳ
        emit('i', 'j');
        break;
      case '\u00F0': // ð
        emit('d');
        break;
      case '\u00F1': // ñ
        emit('n');
        break;
      case '\u00F2': // ò
      case '\u00F3': // ó
      case '\u00F4': // ô
      case '\u00F5': // õ
      case '\u00F6': // ö
      case '\u00F8': // ø
        emit('o');
        break;
      case '\u0153': // œ
        emit('o', 'e');
        break;
      case '\u00DF': // ß
        emit('s', 's');
        break;
      case '\u00FE': // þ
        emit('t', 'h');
        break;
      case '\u00F9': // ù
      case '\u00FA': // ú
      case '\u00FB': // û
      case '\u00FC': // ü
        emit('u');
        break;
      case '\u00FD': // ý
      case '\u00FF': // ÿ
        emit('y');
        break;

      // ---- ligatures ----
      // U+FB03 (ﬃ) and U+FB04 (ﬄ) are deliberately left unfolded: they would
      // expand to three characters and could overflow the 2*length output
      // buffer, and sizing the buffer at 3*length was judged too expensive.
      case '\uFB00': // ﬀ
        emit('f', 'f');
        break;
      case '\uFB01': // ﬁ
        emit('f', 'i');
        break;
      case '\uFB02': // ﬂ
        emit('f', 'l');
        break;
      case '\uFB05': // ﬅ
        // NOTE(review): U+FB05 is the long-s + t ligature, yet it is folded to
        // "ft". Left as-is because existing indexes depend on this mapping.
        emit('f', 't');
        break;
      case '\uFB06': // ﬆ
        emit('s', 't');
        break;

      default:
        // In range but without a defined replacement: keep the character.
        emit(c);
        break;
    }
  }
}