PortugueseMinimalStemmer.java example

Explorer
solrcene-master
package org.apache.lucene.analysis.pt;

import java.util.Arrays;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Minimal Stemmer for Portuguese
 * <p>
 * This follows the "RSLP-S" algorithm presented in:
 * <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese
 * Information Retrieval</i> (Orengo, et al)
 * which is just the plural reduction step of the RSLP
 * algorithm from <i>A Stemming Algorithm for the Portuguese Language</i>,
 * Orengo et al.
 */
public class PortugueseMinimalStemmer {

  /**
   * Words ending in "is" whose ending must not be rewritten to "il"
   * by the generic "is" rule in {@link #stem(char[], int)}.
   */
  private static final CharArraySet excIS = new CharArraySet(
      Version.LUCENE_31,
      Arrays.asList(
          "lápis", "cais", "mais", "crúcis", "biquínis", "pois",
          "depois", "dois", "leis"),
      false);

  /**
   * Words ending in "s" that keep their final "s" when no more specific
   * rule in {@link #stem(char[], int)} has applied.
   */
  private static final CharArraySet excS = new CharArraySet(
      Version.LUCENE_31,
      Arrays.asList(
          "aliás", "pires", "lápis", "cais", "mais", "mas", "menos",
          "férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés",
          "através", "convés", "ês", "país", "após", "ambas", "ambos",
          "messias", "depois"),
      false);

  /**
   * Reduces a plural form to its singular, rewriting the buffer in place.
   * <p>
   * Only terms of at least three characters that end in 's' are touched.
   * The rules are selected by the characters immediately before the final
   * 's'; the first rule that matches decides the result:
   * <ul>
   *   <li>"ns" becomes "m"</li>
   *   <li>"ões" becomes "ão" (length 6 or more)</li>
   *   <li>"ães" becomes "ão" (except "mães")</li>
   *   <li>"ais" becomes "al" (except "cais" and "mais")</li>
   *   <li>"éis" and "eis" become "el", "óis" becomes "ol"
   *       (length 5 or more)</li>
   *   <li>any other "is" becomes "il", unless the term is in the "is"
   *       exception set</li>
   *   <li>"les" and "res" lose their last two characters (length 6 or
   *       more, except "árvores")</li>
   *   <li>otherwise the final "s" is dropped, unless the term is in the
   *       "s" exception set</li>
   * </ul>
   *
   * @param s buffer holding the term; may be modified
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed term within {@code s}
   */
  public int stem(char s[], int len) {
    if (len < 3 || s[len - 1] != 's') {
      return len;
    }

    final int penult = len - 2;       // index of the character before the final 's'
    final int singular = len - 1;     // length once one trailing character is gone
    final char beforeS = s[penult];

    // "ns" -> "m"
    if (beforeS == 'n') {
      s[penult] = 'm';
      return singular;
    }

    // len >= 3 is guaranteed here, so this read is always in range; the
    // individual rules below still apply their own minimum lengths.
    final char antepenult = s[len - 3];

    if (beforeS == 'e') {
      // "ões" -> "ão"
      if (antepenult == 'õ' && len >= 6) {
        s[len - 3] = 'ã';
        s[penult] = 'o';
        return singular;
      }

      // "ães" -> "ão", but leave "mães" to the generic rule at the end
      if (antepenult == 'ã' && len >= 4 && !(len == 4 && s[0] == 'm')) {
        s[penult] = 'o';
        return singular;
      }

      // "les" -> ""
      if (antepenult == 'l' && len >= 6) {
        return len - 2;
      }

      // "res" -> "", but leave "árvores" to the generic rule at the end
      if (antepenult == 'r' && len >= 6 && !isArvores(s, len)) {
        return len - 2;
      }
    } else if (beforeS == 'i' && len >= 4) {
      switch (antepenult) {
        case 'a':
          // "ais" -> "al", except the four-letter words "cais" and "mais"
          if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) {
            s[penult] = 'l';
            return singular;
          }
          break;
        case 'é':
          // "éis" -> "el"
          if (len >= 5) {
            s[len - 3] = 'e';
            s[penult] = 'l';
            return singular;
          }
          break;
        case 'e':
          // "eis" -> "el"
          if (len >= 5) {
            s[penult] = 'l';
            return singular;
          }
          break;
        case 'ó':
          // "óis" -> "ol"
          if (len >= 5) {
            s[len - 3] = 'o';
            s[penult] = 'l';
            return singular;
          }
          break;
        default:
          break;
      }

      // generic "is" -> "il": the length stays the same, only the last
      // character changes
      if (!excIS.contains(s, 0, len)) {
        s[len - 1] = 'l';
        return len;
      }
    }

    // generic rule: drop the final 's' unless the term is a listed exception
    return excS.contains(s, 0, len) ? len : singular;
  }

  /**
   * Tells whether a seven-character term starts with "árvo". It is only
   * called for terms already known to end in "res", so a match means the
   * term is "árvores".
   */
  private static boolean isArvores(char[] s, int len) {
    return len == 7
        && s[0] == 'á'
        && s[1] == 'r'
        && s[2] == 'v'
        && s[3] == 'o';
  }
}