RuleBasedPluralizer.java example

Explorer
morphadorner-opensource-master
- src
package edu.northwestern.at.utils.corpuslinguistics.inflector.pluralizer;

/*	Please see the license information in the header below. */

import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.inflector.wordrule.*;

/** A {@link Pluralizer} implemented using an ordered list of {@link edu.northwestern.at.utils.corpuslinguistics.inflector.wordrule.WordRule}s.
 *
 *	<p>
 *	You may specify a fallback {@link Pluralizer} that is invoked when
 *	none of the rules match.  This allows you to override some rules of
 *	another {@link Pluralizer}.
 *	</p>
 *
 *	<p>
 *	This class preserves leading and trailing whitespace, so individual
 *	rules don't need to explicitly handle whitespace.
 *	</p>
 *
 *	<p>
 *	Case is also preserved -- that is, the output of all uppercase input
 *	is automatically uppercased, and the output of titlecase input is
 *	automatically titlecased.  This means rules can act in a
 *	case-insensitive manner.
 *	</p>
 *
 *	<p>
 *	Original code written by Tom White under the Apache v2 license.
 *	Modified by Philip R. Burns for integration into MorphAdorner.
 *	</p>
 */

public class RuleBasedPluralizer
	implements Pluralizer
{
	/** Fall back pluralizer used when the caller does not supply one. */

	protected static final Pluralizer NOOP_PLURALIZER = new NoopPluralizer();

	/** Splits input into leading whitespace, the word, and trailing
	 *	whitespace.  Compiled once instead of on every call.
	 */

	private static final Pattern WORD_WITH_WHITESPACE_PATTERN =
		Pattern.compile( "\\A(\\s*)(.+?)(\\s*)\\Z" );

	/** Matches a word made up solely of uppercase letters. */

	private static final Pattern ALL_UPPERCASE_PATTERN =
		Pattern.compile( "^\\p{Lu}+$" );

	/** Matches a word whose first character is an uppercase letter. */

	private static final Pattern LEADING_UPPERCASE_PATTERN =
		Pattern.compile( "^\\p{Lu}.*" );

	/** The rules to try, in order.  The first rule that applies wins. */

	protected List<WordRule> rules;

	/** The locale used when changing the case of a pluralized word. */

	protected Locale locale;

	/** The pluralizer consulted when none of the rules applies. */

	protected Pluralizer fallbackPluralizer;

	/** Construct a pluralizer with an empty list of rules.
	 *
	 *	<p>
	 *	The default locale is used, and the noop pluralizer is installed
	 *	as the fall back pluralizer, matching the two-argument
	 *	constructor.
	 *	</p>
	 */

	public RuleBasedPluralizer()
	{
		this.rules				= ListFactory.createNewList();
		this.locale				= Locale.getDefault();
		this.fallbackPluralizer	= NOOP_PLURALIZER;
	}

	/** Construct a pluralizer with a list of rules.
	 *
	 *	@param	rules	The rules to apply, in order.
	 *	@param	locale	The locale specifying the language of the pluralizer.
	 *
	 *	<p>
	 *	A noop pluralizer is used as the fall back pluralizer when none of
	 *	the specified rules applies.
	 *	</p>
	 */

	public RuleBasedPluralizer( List<WordRule> rules , Locale locale )
	{
		this( rules , locale , NOOP_PLURALIZER );
	}

	/** Construct a pluralizer with a list of rules and a backup pluralizer.
	 *
	 *	@param	rules	The rules to apply, in order.
	 *	@param	locale	The locale specifying the language of the pluralizer.
	 *	@param	fallbackPluralizer	The pluralizer to use when no rules match.
	 *
	 *	<p>
	 *	The fall back pluralizer is invoked when none of the specified rules
	 *	applies.
	 *	</p>
	 */

	public RuleBasedPluralizer
	(
		List<WordRule> rules ,
		Locale locale ,
		Pluralizer fallbackPluralizer
	)
	{
		this.rules				= rules;
		this.locale				= locale;
		this.fallbackPluralizer	= fallbackPluralizer;
	}

	/** Get fall back pluralizer.
	 *
	 *	@return		The fall back pluralizer.
	 */

	public Pluralizer getFallbackPluralizer()
	{
		return fallbackPluralizer;
	}

	/** Set the fall back pluralizer.
	 *
	 *	@param	fallbackPluralizer	The fall back pluralizer.
	 */

	public void setFallbackPluralizer( Pluralizer fallbackPluralizer )
	{
		this.fallbackPluralizer = fallbackPluralizer;
	}

	/** Get the locale.
	 *
	 *	@return		The pluralizer locale.
	 */

	public Locale getLocale()
	{
		return locale;
	}

	/** Set the pluralizer locale.
	 *
	 *	@param	locale	The pluralizer locale.
	 */

	public void setLocale( Locale locale )
	{
		this.locale = locale;
	}

	/** Get the pluralizer rules.
	 *
	 *	@return		The pluralizer rules.
	 */

	public List<WordRule> getRules()
	{
		return rules;
	}

	/** Set the pluralizer rules.
	 *
	 *	@param	rules	The pluralizer rules.
	 */

	public void setRules( List<WordRule> rules )
	{
		this.rules = rules;
	}

	/** Pluralize a noun or pronoun.
	 *
	 *	@param	nounOrPronoun	The singular form of the noun or pronoun.
	 *
	 *	@return					The plural form of the noun or pronoun.
	 */

	public String pluralize( String nounOrPronoun )
	{
		return pluralize( nounOrPronoun , 2 );
	}

	/** Pluralize a noun or pronoun.
	 *
	 *	@param	nounOrPronoun	The singular form of the noun or pronoun.
	 *	@param	number			The number for the noun or pronoun.
	 *
	 *	@return			The form of the noun or pronoun for the specified
	 *					number.
	 *
	 *	<p>
	 *	The input is returned unchanged when <code>number</code> is 1,
	 *	when the input is <code>null</code>, or when the input contains
	 *	no word to pluralize.  Leading and trailing whitespace is kept
	 *	around the result produced by the rules.  When no rule applies,
	 *	the untrimmed input is handed to the fall back pluralizer; if
	 *	no fall back pluralizer is set, the input is returned unchanged.
	 *	</p>
	 */

	public String pluralize( String nounOrPronoun , int number )
	{
								//	Singular requested, or nothing to
								//	work on: hand the input straight back.

		if ( ( number == 1 ) || ( nounOrPronoun == null ) )
		{
			return nounOrPronoun;
		}

		Matcher matcher	=
			WORD_WITH_WHITESPACE_PATTERN.matcher( nounOrPronoun );

		if ( !matcher.matches() )
		{
			return nounOrPronoun;
		}

		String pre			= matcher.group( 1 );
		String trimmedWord	= matcher.group( 2 );
		String post			= matcher.group( 3 );
		String plural		= pluralizeInternal( trimmedWord );

		if ( plural == null )
		{
								//	No rule matched.  Without a fall back
								//	pluralizer the best we can do is
								//	leave the word alone.

			if ( fallbackPluralizer == null )
			{
				return nounOrPronoun;
			}

			return fallbackPluralizer.pluralize( nounOrPronoun , number );
		}

		return pre + postProcess( trimmedWord , plural ) + post;
	}

	/** Apply list of rules to a noun or pronoun.
	 *
	 *	@param	nounOrPronoun	Singular noun or pronoun.
	 *
	 *	@return		Plural form of the noun or pronoun, or <code>null</code>
	 *				when no rule matches (including when no rule list
	 *				is set).
	 */

	protected String pluralizeInternal( String nounOrPronoun )
	{
		if ( rules == null )
		{
			return null;
		}

								//	Rules are ordered: the first rule that
								//	applies determines the result.

		for ( WordRule rule : rules )
		{
			if ( rule.applies( nounOrPronoun ) )
			{
				return rule.apply( nounOrPronoun );
			}
		}

		return null;
	}

	/** Fix case of pluralized word.
	 *
	 *	@param	trimmedWord		The input word, with leading and trailing
	 *							whitespace removed.
	 *	@param	pluralizedWord	The pluralized word.
	 *
	 *	@return					The <code>pluralizedWord</code> after
	 *							processing.
	 *
	 *	<p>
	 *	If <code>trimmedWord</code> is all uppercase, then
	 *	<code>pluralizedWord</code> is uppercased.
	 *	If <code>trimmedWord</code> starts with an uppercase letter, then
	 *	the first character of <code>pluralizedWord</code> is uppercased
	 *	and the remainder is left as is.
	 *	A <code>null</code> or empty <code>pluralizedWord</code> is
	 *	returned unchanged.
	 *	</p>
	 */

	protected String postProcess
	(
		String trimmedWord ,
		String pluralizedWord
	)
	{
								//	Nothing to recase.  Also protects the
								//	first-character split below.

		if ( ( pluralizedWord == null ) || ( pluralizedWord.length() == 0 ) )
		{
			return pluralizedWord;
		}

								//	Fall back to the default locale, as
								//	the no-argument constructor does, when
								//	the locale has been set to null.

		Locale caseLocale	=
			( locale == null ) ? Locale.getDefault() : locale;

		if ( ALL_UPPERCASE_PATTERN.matcher( trimmedWord ).matches() )
		{
			return pluralizedWord.toUpperCase( caseLocale );
		}

		if ( LEADING_UPPERCASE_PATTERN.matcher( trimmedWord ).matches() )
		{
								//	Split after the first code point, not
								//	the first char, so a surrogate pair is
								//	never cut in half.

			int firstEnd	= pluralizedWord.offsetByCodePoints( 0 , 1 );

			return
				pluralizedWord.substring( 0 , firstEnd ).toUpperCase(
					caseLocale ) +
				pluralizedWord.substring( firstEnd );
		}

		return pluralizedWord;
	}
}