package org.apache.lucene.analysis.tr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.tr.util.PatternTableFactory;
import org.apache.lucene.analysis.CharArrayMap;

import java.io.IOException;
import java.util.Arrays;

import static org.apache.lucene.analysis.tr.util.PatternTableFactory.*;

/**
 * Translation of <a href="https://github.com/emres/turkish-deasciifier">Turkish Deasciifier</a> from Lisp into Java.
 *
 * <p>For every token of the wrapped stream, each character that has an entry in
 * {@code turkish_toggle_accent_table} is examined against the pattern tables supplied by
 * {@link PatternTableFactory} and, when the patterns say so, replaced by its toggled counterpart.
 * A token that was changed gets the type
 * {@code Zemberek2DeASCIIfyFilterFactory.DEASCII_TOKEN_TYPE}. When {@code preserveOriginal} is
 * set, the unmodified token is emitted right after the changed one with a position increment of 0.
 *
 * <p>The private helpers keep the snake_case names of the upstream implementation so the two can
 * be compared side by side.
 */
public final class TurkishDeASCIIfyFilter extends TokenFilter {

  /** Number of characters kept on each side of the examined character in a context window. */
  private static final int TURKISH_CONTEXT_SIZE = 10;

  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);

  /** Whether the unmodified token is also emitted when a token gets changed. */
  private final boolean preserveOriginal;

  /** Captured state of the unmodified token waiting to be emitted, or {@code null} if none. */
  private State state;

  /**
   * Creates the filter.
   *
   * @param input            the token stream to wrap
   * @param preserveOriginal if {@code true}, a changed token is followed by its original form
   */
  public TurkishDeASCIIfyFilter(TokenStream input, boolean preserveOriginal) {
    super(input);
    this.preserveOriginal = preserveOriginal;
  }

  /**
   * Determine if char at cursor needs correction.
   *
   * @param c              the character currently stored at {@code point}
   * @param point          index of {@code c} inside {@code turkish_string}
   * @param turkish_string the characters being converted
   * @param length         number of valid characters in {@code turkish_string}
   * @return {@code true} if the character at {@code point} should be toggled
   */
  private static boolean turkish_need_correction(char c, int point, char[] turkish_string, int length) {
    // tr is the entry of turkish_asciify_table for c, or c itself when the table has no entry.
    final Character tr;
    if (turkish_asciify_table.containsKey(c)) {
      tr = turkish_asciify_table.get(c);
    } else {
      tr = c;
    }

    // Without a pattern table for this letter the match counts as negative.
    final CharArrayMap<Integer> patterns = PatternTableFactory.getMap(Character.toLowerCase(tr));
    final boolean matched = patterns != null && turkish_match_pattern(patterns, point, turkish_string, length);

    final boolean unchangedByTable = (c == tr);
    if (tr.equals('I')) {
      // For 'I' the sense of the match is inverted compared to every other letter.
      return unchangedByTable ? !matched : matched;
    }
    return unchangedByTable ? matched : !matched;
  }

  /**
   * Builds the context window around {@code point}: a buffer of {@code 2 * size + 1} characters,
   * pre-filled with blanks, with {@code 'X'} in the middle standing for the examined character.
   *
   * <p>The right half is filled from {@code turkish_downcase_asciify_table} and stops at the first
   * character that table has no entry for. The left half is filled from
   * {@code turkish_upcase_accents_table}; a run of characters without an entry there is collapsed
   * into a single blank. The buffer is always returned at full width, unfilled cells stay blank.
   *
   * @param size           number of context characters on each side
   * @param point          index of the examined character in {@code turkish_string}
   * @param turkish_string the characters being converted
   * @param length         number of valid characters in {@code turkish_string}
   * @return the context buffer, always {@code 2 * size + 1} characters long
   */
  private static char[] turkish_get_context(int size, int point, char[] turkish_string, int length) {
    final char[] s = new char[1 + (2 * size)];
    Arrays.fill(s, ' ');
    s[size] = 'X';

    // Right-hand context: copy mapped characters until the window is full, the input ends,
    // or a character without a mapping is met (its cell simply stays blank).
    int right = size + 1;
    for (int index = point + 1; right < s.length && index < length; index++, right++) {
      final Character mapped = turkish_downcase_asciify_table.get(turkish_string[index]);
      if (mapped == null) {
        break;
      }
      s[right] = mapped;
    }

    // Left-hand context, walking backwards from the character before point.
    int left = size - 1;
    boolean space = false;
    for (int index = point - 1; left >= 0 && index >= 0; index--) {
      final Character mapped = turkish_upcase_accents_table.get(turkish_string[index]);
      if (mapped != null) {
        s[left] = mapped;
        left--;
        space = false;
      } else if (!space) {
        // First unmapped character of a run: leave exactly one blank cell for the whole run.
        left--;
        space = true;
      }
    }

    return s;
  }

  /**
   * Looks up every substring of the context window that contains the central {@code 'X'} in
   * {@code dlist} and keeps the value with the smallest absolute magnitude.
   *
   * @param dlist          pattern table mapping context substrings to signed ranks
   * @param point          index of the examined character in {@code turkish_string}
   * @param turkish_string the characters being converted
   * @param length         number of valid characters in {@code turkish_string}
   * @return {@code true} if the selected rank is positive; when nothing matches, the initial rank
   *         of {@code dlist.size() * 2} is what gets tested
   */
  private static boolean turkish_match_pattern(CharArrayMap<Integer> dlist, int point, char[] turkish_string, int length) {
    // Start with a magnitude larger than the table size so that any found rank below it wins.
    int rank = dlist.size() * 2;
    final char[] str = turkish_get_context(TURKISH_CONTEXT_SIZE, point, turkish_string, length);

    // start never moves past the 'X' and end never stops before it, so 'X' is always included.
    for (int start = 0; start <= TURKISH_CONTEXT_SIZE; start++) {
      for (int end = TURKISH_CONTEXT_SIZE + 1; end <= str.length; end++) {
        final Integer r = dlist.get(str, start, end - start);
        if (r != null && Math.abs(r) < Math.abs(rank)) {
          rank = r;
        }
      }
    }
    return rank > 0;
  }

  /**
   * Adds necessary accents to the words in the region.
   *
   * <p>The given array is modified in place; the returned string is built from it afterwards.
   *
   * @param turkish_string the characters to convert, examined over their full length
   * @return a new string holding the converted characters
   */
  public static String convert_to_turkish(char[] turkish_string) {
    for (int i = 0; i < turkish_string.length; i++) {
      final char c = turkish_string[i];
      if (turkish_toggle_accent_table.containsKey(c)
          && turkish_need_correction(c, i, turkish_string, turkish_string.length)) {
        turkish_string[i] = turkish_toggle_accent_table.get(c);
      }
    }
    return new String(turkish_string);
  }

  /**
   * Adds necessary accents to the words in the region.
   *
   * <p>The given array is modified in place. If {@code preserveOriginal} is set, the attribute
   * state is captured once, immediately before the first character is changed, so that the
   * unmodified token can be emitted later.
   *
   * @param turkish_string the characters to convert
   * @param length         number of valid characters in {@code turkish_string}
   * @return {@code true} if at least one character was changed
   */
  public boolean convert_to_turkish(char[] turkish_string, int length) {
    boolean changed = false;
    for (int i = 0; i < length; i++) {
      final char c = turkish_string[i];
      if (!turkish_toggle_accent_table.containsKey(c)) {
        continue;
      }
      if (!turkish_need_correction(c, i, turkish_string, length)) {
        continue;
      }
      if (!changed && preserveOriginal) {
        // We are about to make the first change: capture the original state, once only.
        state = captureState();
      }
      turkish_string[i] = turkish_toggle_accent_table.get(c);
      changed = true;
    }
    return changed;
  }

  /**
   * Emits the pending original token if one was captured, otherwise advances the wrapped stream
   * and converts the next token in place.
   *
   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (state != null) {
      assert preserveOriginal : "state should only be captured if preserveOriginal is true";
      restoreState(state);
      // The original form is stacked on the same position as the converted token.
      posIncAttr.setPositionIncrement(0);
      state = null;
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    final char[] buffer = termAtt.buffer();
    final int length = termAtt.length();
    if (convert_to_turkish(buffer, length)) {
      typeAtt.setType(Zemberek2DeASCIIfyFilterFactory.DEASCII_TOKEN_TYPE);
    }
    return true;
  }

  /**
   * Resets the wrapped stream and discards any pending original token.
   *
   * @throws IOException if the wrapped stream fails to reset
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    state = null;
  }
}