package org.apache.lucene.analysis.cz; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import static org.apache.lucene.analysis.util.StemmerUtil.*; /** * Light Stemmer for Czech. * <p> * Implements the algorithm described in: * <i> * Indexing and stemming approaches for the Czech language * </i> * http://portal.acm.org/citation.cfm?id=1598600 * </p> */ public class CzechStemmer { /** * Stem an input buffer of Czech text. * * @param s input buffer * @param len length of input buffer * @return length of input buffer after normalization * * <p><b>NOTE</b>: Input is expected to be in lowercase, * but with diacritical marks</p> */ public int stem(char s[], int len) { len = removeCase(s, len); len = removePossessives(s, len); len = normalize(s, len); return len; } private int removeCase(char s[], int len) { if (len > 7 && endsWith(s, len, "atech")) return len - 5; if (len > 6 && (endsWith(s, len,"ětem") || endsWith(s, len,"etem") || endsWith(s, len,"atům"))) return len - 4; if (len > 5 && (endsWith(s, len, "ech") || endsWith(s, len, "ich") || endsWith(s, len, "ích") || endsWith(s, len, "ého") || endsWith(s, len, "ěmi") || endsWith(s, len, "emi") || endsWith(s, len, "ému") || endsWith(s, len, "ěte") || endsWith(s, len, "ete") || endsWith(s, len, "ěti") || endsWith(s, len, "eti") || endsWith(s, len, "ího") || endsWith(s, len, "iho") || endsWith(s, len, "ími") || endsWith(s, len, "ímu") || endsWith(s, len, "imu") || endsWith(s, len, "ách") || endsWith(s, len, "ata") || endsWith(s, len, "aty") || endsWith(s, len, "ých") || endsWith(s, len, "ama") || endsWith(s, len, "ami") || endsWith(s, len, "ové") || endsWith(s, len, "ovi") || endsWith(s, len, "ými"))) return len - 3; if (len > 4 && (endsWith(s, len, "em") || endsWith(s, len, "es") || endsWith(s, len, "ém") || endsWith(s, len, "ím") || endsWith(s, len, "ům") || endsWith(s, len, "at") || endsWith(s, len, "ám") || endsWith(s, len, "os") || endsWith(s, len, "us") || endsWith(s, len, "ým") || endsWith(s, len, "mi") || endsWith(s, len, "ou"))) return len - 2; if (len > 3) { switch (s[len - 1]) { case 'a': case 'e': case 'i': case 'o': case 'u': case 'ů': case 'y': case 'á': case 'é': case 'í': case 'ý': case 'ě': return len - 1; } } return len; } private int removePossessives(char s[], int len) { if (len > 5 && (endsWith(s, len, "ov") || endsWith(s, len, "in") || endsWith(s, len, "ův"))) return len - 2; return len; } private int normalize(char s[], int len) { if (endsWith(s, len, "čt")) { // čt -> ck s[len - 2] = 'c'; s[len - 1] = 'k'; return len; } if (endsWith(s, len, "št")) { // št -> sk s[len - 2] = 's'; s[len - 1] = 'k'; return len; } switch(s[len - 1]) { case 'c': // [cč] -> k case 'č': s[len - 1] = 'k'; return len; case 'z': // [zž] -> h case 'ž': s[len - 1] = 'h'; return len; } if (len > 1 && s[len - 2] == 'e') { s[len - 2] = s[len - 1]; // e* > * return len - 1; } if (len > 2 && s[len - 2] == 'ů') { s[len - 2] = 'o'; // *ů* -> *o* return len; } return len; } }