package org.apache.lucene.analysis.miscellaneous; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; /** * * This is a modified ASCIIFoldingFilter * * It translates greek math symbols * * For example, 'γ' will be replaced by 'gamma'. */ public final class AdsSpecialCharactersFilter extends TokenFilter { public AdsSpecialCharactersFilter(TokenStream input) { super(input); } private char[] output = new char[512]; private int outputPos; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { final char[] buffer = termAtt.buffer(); final int length = termAtt.length(); // If no characters actually require rewriting then we // just return token as-is: for(int i = 0 ; i < length ; ++i) { final char c = buffer[i]; if (c >= '\u0080') { foldToASCII(buffer, length); termAtt.copyBuffer(output, 0, outputPos); break; } } return true; } else { return false; } } /** * Converts characters above ASCII to their ASCII equivalents. For example, * accents are removed from accented characters. * @param input The string to fold * @param length The number of characters in the input string */ public void foldToASCII(char[] input, int length) { // Worst-case length required: final int maxSizeNeeded = 4 * length; if (output.length < maxSizeNeeded) { output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)]; } outputPos = foldToASCII(input, 0, output, 0, length); } /** * Converts characters above ASCII to their ASCII equivalents. For example, * accents are removed from accented characters. * @param input The characters to fold * @param inputPos Index of the first character to fold * @param output The result of the folding. Should be of size >= {@code length * 4}. * @param outputPos Index of output where to put the result of the folding * @param length The number of characters to fold * @return length of output * @lucene.internal */ public static final int foldToASCII(char input[], int inputPos, char output[], int outputPos, int length) { final int end = inputPos + length; for (int pos = inputPos; pos < end ; ++pos) { final char c = input[pos]; // Quick test: if it's not in range then just keep current character if (c < '\u0080') { output[outputPos++] = c; } else { switch (c) { case '\u0391': case '\u03B1': output[outputPos++] = 'a'; output[outputPos++] = 'l'; output[outputPos++] = 'p'; output[outputPos++] = 'h'; output[outputPos++] = 'a'; break; case '\u0392': case '\u03B2': output[outputPos++] = 'b'; output[outputPos++] = 'e'; output[outputPos++] = 't'; output[outputPos++] = 'a'; break; case '\u0393': case '\u03B3': output[outputPos++] = 'g'; output[outputPos++] = 'a'; output[outputPos++] = 'm'; output[outputPos++] = 'm'; output[outputPos++] = 'a'; break; case '\u0394': case '\u03B4': output[outputPos++] = 'd'; output[outputPos++] = 'e'; output[outputPos++] = 'l'; output[outputPos++] = 't'; output[outputPos++] = 'a'; break; case '\u0395': case '\u03B5': output[outputPos++] = 'e'; output[outputPos++] = 'p'; output[outputPos++] = 's'; output[outputPos++] = 'i'; output[outputPos++] = 'l'; output[outputPos++] = 'o'; output[outputPos++] = 'n'; break; case '\u0396': case '\u03B6': output[outputPos++] = 'z'; output[outputPos++] = 'e'; output[outputPos++] = 't'; output[outputPos++] = 'a'; break; case '\u0397': case '\u03B7': output[outputPos++] = 'e'; output[outputPos++] = 't'; output[outputPos++] = 'a'; break; case '\u0398': case '\u03B8': output[outputPos++] = 't'; output[outputPos++] = 'h'; output[outputPos++] = 'e'; output[outputPos++] = 't'; output[outputPos++] = 'a'; break; case '\u0399': case '\u03B9': output[outputPos++] = 'i'; output[outputPos++] = 'o'; output[outputPos++] = 't'; output[outputPos++] = 'a'; break; case '\u039A': case '\u03BA': output[outputPos++] = 'k'; output[outputPos++] = 'a'; output[outputPos++] = 'p'; output[outputPos++] = 'p'; output[outputPos++] = 'a'; break; case '\u039B': case '\u03BB': output[outputPos++] = 'l'; output[outputPos++] = 'a'; output[outputPos++] = 'm'; output[outputPos++] = 'b'; output[outputPos++] = 'd'; output[outputPos++] = 'a'; break; case '\u039C': case '\u03BC': output[outputPos++] = 'm'; output[outputPos++] = 'u'; break; case '\u039D': case '\u03BD': output[outputPos++] = 'n'; output[outputPos++] = 'u'; break; case '\u039E': case '\u03BE': output[outputPos++] = 'x'; output[outputPos++] = 'i'; break; case '\u039F': case '\u03BF': output[outputPos++] = 'o'; output[outputPos++] = 'm'; output[outputPos++] = 'i'; output[outputPos++] = 'c'; output[outputPos++] = 'r'; output[outputPos++] = 'o'; output[outputPos++] = 'n'; break; case '\u03A0': case '\u03C0': output[outputPos++] = 'p'; output[outputPos++] = 'i'; break; case '\u03A1': case '\u03C1': output[outputPos++] = 'r'; output[outputPos++] = 'h'; output[outputPos++] = 'o'; break; case '\u03A3': case '\u03C3': output[outputPos++] = 's'; output[outputPos++] = 'i'; output[outputPos++] = 'g'; output[outputPos++] = 'm'; output[outputPos++] = 'a'; break; case '\u03A4': case '\u03C4': output[outputPos++] = 't'; output[outputPos++] = 'a'; output[outputPos++] = 'u'; break; case '\u03A5': case '\u03C5': output[outputPos++] = 'u'; output[outputPos++] = 'p'; output[outputPos++] = 's'; output[outputPos++] = 'i'; output[outputPos++] = 'l'; output[outputPos++] = 'o'; output[outputPos++] = 'n'; break; case '\u03A6': case '\u03C6': output[outputPos++] = 'p'; output[outputPos++] = 'h'; output[outputPos++] = 'i'; break; case '\u03A7': case '\u03C7': output[outputPos++] = 'c'; output[outputPos++] = 'h'; output[outputPos++] = 'i'; break; case '\u03A8': case '\u03C8': output[outputPos++] = 'p'; output[outputPos++] = 's'; output[outputPos++] = 'i'; break; case '\u03A9': case '\u03C9': output[outputPos++] = 'o'; output[outputPos++] = 'm'; output[outputPos++] = 'e'; output[outputPos++] = 'g'; output[outputPos++] = 'a'; break; default: output[outputPos++] = c; break; } } } return outputPos; } }