HunspellStemmer.java example

Explorer
solr-analytics-master
- lucene
- solr
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
 * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word.  It
 * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
 */
public class HunspellStemmer {

  private static final int RECURSION_CAP = 2;
  
  private final HunspellDictionary dictionary;
  private final StringBuilder segment = new StringBuilder();
  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);

  /**
   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
   *
   * @param dictionary HunspellDictionary that will be used to create the stems
   */
  public HunspellStemmer(HunspellDictionary dictionary) {
    this.dictionary = dictionary;
  }

  /**
   * Find the stem(s) of the provided word
   * 
   * @param word Word to find the stems for
   * @return List of stems for the word
   */
  public List<Stem> stem(String word) {
    return stem(word.toCharArray(), word.length());
  }

  /**
   * Find the stem(s) of the provided word
   * 
   * @param word Word to find the stems for
   * @return List of stems for the word
   */
  public List<Stem> stem(char word[], int length) {
    List<Stem> stems = new ArrayList<Stem>();
    if (dictionary.lookupWord(word, 0, length) != null) {
      stems.add(new Stem(word, length));
    }
    stems.addAll(stem(word, length, null, 0));
    return stems;
  }
  
  /**
   * Find the unique stem(s) of the provided word
   * 
   * @param word Word to find the stems for
   * @return List of stems for the word
   */
  public List<Stem> uniqueStems(char word[], int length) {
    List<Stem> stems = new ArrayList<Stem>();
    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
    if (dictionary.lookupWord(word, 0, length) != null) {
      stems.add(new Stem(word, length));
      terms.add(word);
    }
    List<Stem> otherStems = stem(word, length, null, 0);
    for (Stem s : otherStems) {
      if (!terms.contains(s.stem)) {
        stems.add(s);
        terms.add(s.stem);
      }
    }
    return stems;
  }

  // ================================================= Helper Methods ================================================

  /**
   * Generates a list of stems for the provided word
   *
   * @param word Word to generate the stems for
   * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
   * @param recursionDepth Level of recursion this stemming step is at
   * @return List of stems, pr an empty if no stems are found
   */
  private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
    List<Stem> stems = new ArrayList<Stem>();

    for (int i = 0; i < length; i++) {
      List<HunspellAffix> suffixes = dictionary.lookupSuffix(word, i, length - i);
      if (suffixes == null) {
        continue;
      }

      for (HunspellAffix suffix : suffixes) {
        if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
          int deAffixedLength = length - suffix.getAppend().length();
          // TODO: can we do this in-place?
          String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();

          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
          for (Stem stem : stemList) {
            stem.addSuffix(suffix);
          }

          stems.addAll(stemList);
        }
      }
    }

    for (int i = length - 1; i >= 0; i--) {
      List<HunspellAffix> prefixes = dictionary.lookupPrefix(word, 0, i);
      if (prefixes == null) {
        continue;
      }

      for (HunspellAffix prefix : prefixes) {
        if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
          int deAffixedStart = prefix.getAppend().length();
          int deAffixedLength = length - deAffixedStart;

          String strippedWord = new StringBuilder().append(prefix.getStrip())
              .append(word, deAffixedStart, deAffixedLength)
              .toString();

          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
          for (Stem stem : stemList) {
            stem.addPrefix(prefix);
          }

          stems.addAll(stemList);
        }
      }
    }

    return stems;
  }

  /**
   * Applies the affix rule to the given word, producing a list of stems if any are found
   *
   * @param strippedWord Word the affix has been removed and the strip added
   * @param affix HunspellAffix representing the affix rule itself
   * @param recursionDepth Level of recursion this stemming step is at
   * @return List of stems for the word, or an empty list if none are found
   */
  @SuppressWarnings("unchecked")
  public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
    if(dictionary.isIgnoreCase()) {
      for(int i=0;i<strippedWord.length;){
        i += Character.toChars(
              Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
      }
    }
    segment.setLength(0);
    segment.append(strippedWord, 0, length);
    if (!affix.checkCondition(segment)) {
      return Collections.EMPTY_LIST;
    }

    List<Stem> stems = new ArrayList<Stem>();

    List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
    if (words != null) {
      for (HunspellWord hunspellWord : words) {
        if (hunspellWord.hasFlag(affix.getFlag())) {
          stems.add(new Stem(strippedWord, length));
        }
      }
    }

    if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
      stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
    }

    return stems;
  }

  /**
   * Checks if the given flag cross checks with the given array of flags
   *
   * @param flag Flag to cross check with the array of flags
   * @param flags Array of flags to cross check against.  Can be {@code null}
   * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
   */
  private boolean hasCrossCheckedFlag(char flag, char[] flags) {
    return flags == null || Arrays.binarySearch(flags, flag) >= 0;
  }

  /**
   * Stem represents all information known about a stem of a word.  This includes the stem, and the prefixes and suffixes
   * that were used to change the word into the stem.
   */
  public static class Stem {

    private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
    private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
    private final char stem[];
    private final int stemLength;

    /**
     * Creates a new Stem wrapping the given word stem
     *
     * @param stem Stem of a word
     */
    public Stem(char stem[], int stemLength) {
      this.stem = stem;
      this.stemLength = stemLength;
    }

    /**
     * Adds a prefix to the list of prefixes used to generate this stem.  Because it is assumed that prefixes are added
     * depth first, the prefix is added to the front of the list
     *
     * @param prefix Prefix to add to the list of prefixes for this stem
     */
    public void addPrefix(HunspellAffix prefix) {
      prefixes.add(0, prefix);
    }

    /**
     * Adds a suffix to the list of suffixes used to generate this stem.  Because it is assumed that suffixes are added
     * depth first, the suffix is added to the end of the list
     *
     * @param suffix Suffix to add to the list of suffixes for this stem
     */
    public void addSuffix(HunspellAffix suffix) {
      suffixes.add(suffix);
    }

    /**
     * Returns the list of prefixes used to generate the stem
     *
     * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
     */
    public List<HunspellAffix> getPrefixes() {
      return prefixes;
    }

    /**
     * Returns the list of suffixes used to generate the stem
     *
     * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
     */
    public List<HunspellAffix> getSuffixes() {
      return suffixes;
    }

    /**
     * Returns the actual word stem itself
     *
     * @return Word stem itself
     */
    public char[] getStem() {
      return stem;
    }

    /**
     * @return the stemLength
     */
    public int getStemLength() {
      return stemLength;
    }
    
    public String getStemString() {
      return new String(stem, 0, stemLength);
    }
    
  }


  // ================================================= Entry Point ===================================================

  /*
   * HunspellStemmer entry point.  Accepts two arguments: location of affix file and location of dic file
   *
   * @param args Program arguments.  Should contain location of affix file and location of dic file
   * @throws IOException Can be thrown while reading from the files
   * @throws ParseException Can be thrown while parsing the files
  public static void main(String[] args) throws IOException, ParseException {
    boolean ignoreCase = false;
    int offset = 0;
    
    if (args.length < 2) {
      System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
      System.exit(1);
    }

    if(args[offset].equals("-i")) {
      ignoreCase = true;
      System.out.println("Ignoring case. All stems will be returned lowercased");
      offset++;
    }
    
    InputStream affixInputStream = new FileInputStream(args[offset++]);
    InputStream dicInputStream = new FileInputStream(args[offset++]);

    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40, ignoreCase);

    affixInputStream.close();
    dicInputStream.close();
    
    HunspellStemmer stemmer = new HunspellStemmer(dictionary);

    Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
    
    System.out.print("> ");
    while (scanner.hasNextLine()) {
      String word = scanner.nextLine();
      
      if ("exit".equals(word)) {
        break;
      }

      printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
      
      System.out.print("> ");
    }
  }

   * Prints the results of the stemming of a word
   *
   * @param originalWord Word that has been stemmed
   * @param stems Stems of the word
  private static void printStemResults(String originalWord, List<Stem> stems) {
    StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");

    for (Stem stem : stems) {
      builder.append("- ").append(stem.getStem()).append(": ");

      for (HunspellAffix prefix : stem.getPrefixes()) {
        builder.append(prefix.getAppend()).append("+");

        if (hasText(prefix.getStrip())) {
          builder.append(prefix.getStrip()).append("-");
        }
      }

      builder.append(stem.getStem());

      for (HunspellAffix suffix : stem.getSuffixes()) {
        if (hasText(suffix.getStrip())) {
          builder.append("-").append(suffix.getStrip());
        }
        
        builder.append("+").append(suffix.getAppend());
      }
      builder.append("\n");
    }

    System.out.println(builder);
  }

   * Simple utility to check if the given String has any text
   *
   * @param str String to check if it has any text
   * @return {@code true} if the String has text, {@code false} otherwise
  private static boolean hasText(String str) {
    return str != null && str.length() > 0;
  }
  */
}