package org.apache.lucene.analysis.ru; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description). * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead, * which has the same functionality. This filter will be removed in Lucene 4.0 */ @Deprecated class RussianStemmer { // positions of RV, R1 and R2 respectively private int RV, /*R1,*/ R2; // letters (currently unused letters are commented out) private final static char A = '\u0430'; //private final static char B = '\u0431'; private final static char V = '\u0432'; private final static char G = '\u0433'; //private final static char D = '\u0434'; private final static char E = '\u0435'; //private final static char ZH = '\u0436'; //private final static char Z = '\u0437'; private final static char I = '\u0438'; private final static char I_ = '\u0439'; //private final static char K = '\u043A'; private final static char L = '\u043B'; private final static char M = '\u043C'; private final static char N = '\u043D'; private final static char O = '\u043E'; //private final static char P = '\u043F'; //private final static char R = '\u0440'; private final static char S = '\u0441'; private final static char T = '\u0442'; private final static char U = '\u0443'; //private final static char F = '\u0444'; private final static char X = '\u0445'; //private final static char TS = '\u0446'; //private final static char CH = '\u0447'; private final static char SH = '\u0448'; private final static char SHCH = '\u0449'; //private final static char HARD = '\u044A'; private final static char Y = '\u044B'; private final static char SOFT = '\u044C'; private final static char AE = '\u044D'; private final static char IU = '\u044E'; private final static char IA = '\u044F'; // stem definitions private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA }; private static char[][] perfectiveGerundEndings1 = { { V }, { V, SH, I }, { V, SH, I, S, SOFT } }; private static char[][] perfectiveGerund1Predessors = { { A }, { IA } }; private static char[][] perfectiveGerundEndings2 = { { I, V }, { Y, V }, { I, V, SH, I }, { Y, V, SH, I }, { I, V, SH, I, S, SOFT }, { Y, V, SH, I, S, SOFT } }; private static char[][] adjectiveEndings = { { E, E }, { I, E }, { Y, E }, { O, E }, { E, I_ }, { I, I_ }, { Y, I_ }, { O, I_ }, { E, M }, { I, M }, { Y, M }, { O, M }, { I, X }, { Y, X }, { U, IU }, { IU, IU }, { A, IA }, { IA, IA }, { O, IU }, { E, IU }, { I, M, I }, { Y, M, I }, { E, G, O }, { O, G, O }, { E, M, U }, {O, M, U } }; private static char[][] participleEndings1 = { { SHCH }, { E, M }, { N, N }, { V, SH }, { IU, SHCH } }; private static char[][] participleEndings2 = { { I, V, SH }, { Y, V, SH }, { U, IU, SHCH } }; private static char[][] participle1Predessors = { { A }, { IA } }; private static char[][] reflexiveEndings = { { S, IA }, { S, SOFT } }; private static char[][] verbEndings1 = { { I_ }, { L }, { N }, { L, O }, { N, O }, { E, T }, { IU, T }, { L, A }, { N, A }, { L, I }, { E, M }, { N, Y }, { E, T, E }, { I_, T, E }, { T, SOFT }, { E, SH, SOFT }, { N, N, O } }; private static char[][] verbEndings2 = { { IU }, { U, IU }, { E, N }, { E, I_ }, { IA, T }, { U, I_ }, { I, L }, { Y, L }, { I, M }, { Y, M }, { I, T }, { Y, T }, { I, L, A }, { Y, L, A }, { E, N, A }, { I, T, E }, { I, L, I }, { Y, L, I }, { I, L, O }, { Y, L, O }, { E, N, O }, { U, E, T }, { U, IU, T }, { E, N, Y }, { I, T, SOFT }, { Y, T, SOFT }, { I, SH, SOFT }, { E, I_, T, E }, { U, I_, T, E } }; private static char[][] verb1Predessors = { { A }, { IA } }; private static char[][] nounEndings = { { A }, { U }, { I_ }, { O }, { U }, { E }, { Y }, { I }, { SOFT }, { IA }, { E, V }, { O, V }, { I, E }, { SOFT, E }, { IA, X }, { I, IU }, { E, I }, { I, I }, { E, I_ }, { O, I_ }, { E, M }, { A, M }, { O, M }, { A, X }, { SOFT, IU }, { I, IA }, { SOFT, IA }, { I, I_ }, { IA, M }, { IA, M, I }, { A, M, I }, { I, E, I_ }, { I, IA, M }, { I, E, M }, { I, IA, X }, { I, IA, M, I } }; private static char[][] superlativeEndings = { { E, I_, SH }, { E, I_, SH, E } }; private static char[][] derivationalEndings = { { O, S, T }, { O, S, T, SOFT } }; /** * RussianStemmer constructor comment. */ public RussianStemmer() { super(); } /** * Adjectival ending is an adjective ending, * optionally preceded by participle ending. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean adjectival(StringBuilder stemmingZone) { // look for adjective ending in a stemming zone if (!findAndRemoveEnding(stemmingZone, adjectiveEndings)) return false; // if adjective ending was found, try for participle ending. if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)) findAndRemoveEnding(stemmingZone, participleEndings2); return true; } /** * Derivational endings * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean derivational(StringBuilder stemmingZone) { int endingLength = findEnding(stemmingZone, derivationalEndings); if (endingLength == 0) // no derivational ending found return false; else { // Ensure that the ending locates in R2 if (R2 - RV <= stemmingZone.length() - endingLength) { stemmingZone.setLength(stemmingZone.length() - endingLength); return true; } else { return false; } } } /** * Finds ending among given ending class and returns the length of ending found(0, if not found). * Creation date: (17/03/2002 8:18:34 PM) */ private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass) { boolean match = false; for (int i = theEndingClass.length - 1; i >= 0; i--) { char[] theEnding = theEndingClass[i]; // check if the ending is bigger than stemming zone if (startIndex < theEnding.length - 1) { match = false; continue; } match = true; int stemmingIndex = startIndex; for (int j = theEnding.length - 1; j >= 0; j--) { if (stemmingZone.charAt(stemmingIndex--) != theEnding[j]) { match = false; break; } } // check if ending was found if (match) { return theEndingClass[i].length; // cut ending } } return 0; } private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass) { return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass); } /** * Finds the ending among the given class of endings and removes it from stemming zone. * Creation date: (17/03/2002 8:18:34 PM) */ private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass) { int endingLength = findEnding(stemmingZone, theEndingClass); if (endingLength == 0) // not found return false; else { stemmingZone.setLength(stemmingZone.length() - endingLength); // cut the ending found return true; } } /** * Finds the ending among the given class of endings, then checks if this ending was * preceded by any of given predecessors, and if so, removes it from stemming zone. * Creation date: (17/03/2002 8:18:34 PM) */ private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass, char[][] thePredessors) { int endingLength = findEnding(stemmingZone, theEndingClass); if (endingLength == 0) // not found return false; else { int predessorLength = findEnding(stemmingZone, stemmingZone.length() - endingLength - 1, thePredessors); if (predessorLength == 0) return false; else { stemmingZone.setLength(stemmingZone.length() - endingLength); // cut the ending found return true; } } } /** * Marks positions of RV, R1 and R2 in a given word. * Creation date: (16/03/2002 3:40:11 PM) */ private void markPositions(String word) { RV = 0; // R1 = 0; R2 = 0; int i = 0; // find RV while (word.length() > i && !isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // RV zone is empty RV = i; // find R1 while (word.length() > i && isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R1 zone is empty // R1 = i; // find R2 while (word.length() > i && !isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R2 zone is empty while (word.length() > i && isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R2 zone is empty R2 = i; } /** * Checks if character is a vowel.. * Creation date: (16/03/2002 10:47:03 PM) * @return boolean * @param letter char */ private boolean isVowel(char letter) { for (int i = 0; i < vowels.length; i++) { if (letter == vowels[i]) return true; } return false; } /** * Noun endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean noun(StringBuilder stemmingZone) { return findAndRemoveEnding(stemmingZone, nounEndings); } /** * Perfective gerund endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean perfectiveGerund(StringBuilder stemmingZone) { return findAndRemoveEnding( stemmingZone, perfectiveGerundEndings1, perfectiveGerund1Predessors) || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2); } /** * Reflexive endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean reflexive(StringBuilder stemmingZone) { return findAndRemoveEnding(stemmingZone, reflexiveEndings); } /** * Insert the method's description here. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean removeI(StringBuilder stemmingZone) { if (stemmingZone.length() > 0 && stemmingZone.charAt(stemmingZone.length() - 1) == I) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; } } /** * Insert the method's description here. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean removeSoft(StringBuilder stemmingZone) { if (stemmingZone.length() > 0 && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; } } /** * Finds the stem for given Russian word. * Creation date: (16/03/2002 3:36:48 PM) * @return java.lang.String * @param input java.lang.String */ public String stem(String input) { markPositions(input); if (RV == 0) return input; //RV wasn't detected, nothing to stem StringBuilder stemmingZone = new StringBuilder(input.substring(RV)); // stemming goes on in RV // Step 1 if (!perfectiveGerund(stemmingZone)) { reflexive(stemmingZone); if (!adjectival(stemmingZone)) if (!verb(stemmingZone)) noun(stemmingZone); } // Step 2 removeI(stemmingZone); // Step 3 derivational(stemmingZone); // Step 4 superlative(stemmingZone); undoubleN(stemmingZone); removeSoft(stemmingZone); // return result return input.substring(0, RV) + stemmingZone.toString(); } /** * Superlative endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean superlative(StringBuilder stemmingZone) { return findAndRemoveEnding(stemmingZone, superlativeEndings); } /** * Undoubles N. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean undoubleN(StringBuilder stemmingZone) { char[][] doubleN = { { N, N } }; if (findEnding(stemmingZone, doubleN) != 0) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; } } /** * Verb endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuilder */ private boolean verb(StringBuilder stemmingZone) { return findAndRemoveEnding( stemmingZone, verbEndings1, verb1Predessors) || findAndRemoveEnding(stemmingZone, verbEndings2); } /** * Static method for stemming. */ public static String stemWord(String theWord) { RussianStemmer stemmer = new RussianStemmer(); return stemmer.stem(theWord); } }