/*
* File: PorterEnglishStemmingFilter.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright January 27, 2010, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.text.term.filter.stem;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationReferences;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.text.term.DefaultTerm;
import gov.sandia.cognition.text.term.DefaultTermOccurrence;
import gov.sandia.cognition.text.term.TermOccurrence;
import gov.sandia.cognition.text.term.filter.AbstractSingleTermFilter;
import org.tartarus.martin.Stemmer;
/**
* A term filter that uses the Porter Stemming algorithm. It is a rule-based
* algorithm for stemming English words. This class just wraps the Java
* implementation of the stemmer by Martin Porter himself and turns it into
* a TermFilter.
*
* @author Justin Basilico
* @since 3.0
*/
@PublicationReferences(
references={
@PublicationReference(
author="Martin Porter",
title="The Porter Stemming Algorithm",
year=2006,
type=PublicationType.WebPage,
url="http://tartarus.org/~martin/PorterStemmer/"
),
@PublicationReference(
author="Martin F. Porter",
title=" An algorithm for suffix stripping",
year=1980,
publication="Program 14(3)",
pages={130, 137},
type=PublicationType.Journal
),
@PublicationReference(
author="Wikipedia",
title="Stemming",
year=2010,
type=PublicationType.WebPage,
url="http://en.wikipedia.org/wiki/Stemming"
)
}
)
public class PorterEnglishStemmingFilter
extends AbstractSingleTermFilter
{
/**
* Creates a new {@code PorterEnglishStemmingFilter}.
*/
public PorterEnglishStemmingFilter()
{
super();
}
public TermOccurrence filterTerm(
final TermOccurrence occurrence)
{
// Get the old text.
final String oldText = occurrence.getTerm().getName();
// Create the stemmer and apply it.
final Stemmer stemmer = new Stemmer();
stemmer.add(oldText.toCharArray(), oldText.length());
stemmer.stem();
final String newText = stemmer.toString();
// Create the new term and add it to the result.
final DefaultTerm newTerm = new DefaultTerm(newText);
final DefaultTermOccurrence newOccurrence = new DefaultTermOccurrence(
newTerm, occurrence.getStart(), occurrence.getLength());
return newOccurrence;
}
/**
* Stems the given String according to the Porter stemming algorithm for
* English words.
*
* @param word
* The word to stem.
* @return
* The stemmed version of the given word.
*/
public static String stem(
final String word)
{
// Create the stemmer.
final Stemmer stemmer = new Stemmer();
// Add the word.
stemmer.add(word.toCharArray(), word.length());
// Stem the word.
stemmer.stem();
// Return the stem.
return stemmer.toString();
}
}