package; import java.util.Arrays; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Minimal Stemmer for Portuguese * <p> * This follows the "RSLP-S" algorithm presented in: * <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese * Information Retrieval</i> (Orengo, et al) * which is just the plural reduction step of the RSLP * algorithm from <i>A Stemming Algorithm for the Portuguese Language</i>, * Orengo et al. 
*/
public class PortugueseMinimalStemmer {

  /** Sentinel returned by the suffix helpers when none of their rules fired. */
  private static final int NO_RULE = -1;

  /** Words ending in "is" that are left alone by the generic "is" -> "il" rewrite. */
  private static final CharArraySet IS_EXCEPTIONS = new CharArraySet(
      Version.LUCENE_31,
      Arrays.asList(
          "lápis", "cais", "mais", "crúcis", "biquínis", "pois",
          "depois", "dois", "leis"),
      false);

  /** Words ending in "s" whose final letter is kept by the fallback rule. */
  private static final CharArraySet S_EXCEPTIONS = new CharArraySet(
      Version.LUCENE_31,
      Arrays.asList(
          "aliás", "pires", "lápis", "cais", "mais", "mas", "menos",
          "férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés",
          "através", "convés", "ês", "país", "após", "ambas", "ambos",
          "messias", "depois"),
      false);

  /**
   * Applies plural reduction to the word held in {@code s[0..len)}.
   * <p>
   * The buffer is modified in place: some rules overwrite one or two of the
   * trailing characters before the shortened length is reported.
   *
   * @param s   buffer holding the word; may be written to
   * @param len number of valid characters in {@code s}
   * @return the length of the word after stemming (never greater than {@code len})
   */
  public int stem(char s[], int len) {
    // Only words of at least three characters that end in 's' are touched.
    if (len < 3 || s[len - 1] != 's') {
      return len;
    }

    int reduced = NO_RULE;
    switch (s[len - 2]) {
      case 'n':
        // "-ns" -> "-m"
        s[len - 2] = 'm';
        return len - 1;
      case 'e':
        reduced = reduceEs(s, len);
        break;
      case 'i':
        reduced = reduceIs(s, len);
        break;
      default:
        break;
    }
    if (reduced != NO_RULE) {
      return reduced;
    }

    // Fallback: drop the final 's' unless the whole word is a listed exception.
    return S_EXCEPTIONS.contains(s, 0, len) ? len : len - 1;
  }

  /**
   * Handles words ending in "es" (callers guarantee {@code len >= 3}).
   * Returns the new length, or {@link #NO_RULE} if no "-es" rule applies.
   */
  private static int reduceEs(char[] s, int len) {
    final char before = s[len - 3];

    // "-ões" -> "-ão"
    if (len >= 6 && before == 'õ') {
      s[len - 3] = 'ã';
      s[len - 2] = 'o';
      return len - 1;
    }

    // "-ães" -> "-ão", except for a four-letter word starting with 'm'
    if (len >= 4 && before == 'ã') {
      final boolean fourLetterM = len == 4 && s[0] == 'm';
      if (!fourLetterM) {
        s[len - 2] = 'o';
        return len - 1;
      }
    }

    // "-les" -> "-l"
    if (len >= 6 && before == 'l') {
      return len - 2;
    }

    // "-res" -> "-r", except for the seven-letter word starting with "árvo"
    if (len >= 6 && before == 'r' && !startsWithArvo(s, len)) {
      return len - 2;
    }

    return NO_RULE;
  }

  /**
   * Handles words ending in "is" (callers guarantee {@code len >= 3}).
   * Returns the new length, or {@link #NO_RULE} if no "-is" rule applies.
   */
  private static int reduceIs(char[] s, int len) {
    if (len < 4) {
      return NO_RULE;
    }
    final char before = s[len - 3];

    // "-ais" -> "-al", except for four-letter words starting with 'c' or 'm'
    if (before == 'a') {
      final boolean fourLetterCOrM = len == 4 && (s[0] == 'c' || s[0] == 'm');
      if (!fourLetterCOrM) {
        s[len - 2] = 'l';
        return len - 1;
      }
    }

    if (len >= 5) {
      switch (before) {
        case 'é':
          // "-éis" -> "-el"
          s[len - 3] = 'e';
          s[len - 2] = 'l';
          return len - 1;
        case 'e':
          // "-eis" -> "-el"
          s[len - 2] = 'l';
          return len - 1;
        case 'ó':
          // "-óis" -> "-ol"
          s[len - 3] = 'o';
          s[len - 2] = 'l';
          return len - 1;
        default:
          break;
      }
    }

    // Generic "-is" -> "-il": same length, only the last character changes.
    if (!IS_EXCEPTIONS.contains(s, 0, len)) {
      s[len - 1] = 'l';
      return len;
    }

    return NO_RULE;
  }

  /** True when the word is seven characters long and begins with "árvo". */
  private static boolean startsWithArvo(char[] s, int len) {
    return len == 7
        && s[0] == 'á'
        && s[1] == 'r'
        && s[2] == 'v'
        && s[3] == 'o';
  }
}