// AbstractSpellingStandardizer.java example
//
// Explorer
// morphadorner-opensource-master
// - src
package edu.northwestern.at.utils.corpuslinguistics.spellingstandardizer;

/*	Please see the license information at the end of this file. */

import java.io.*;
import java.net.*;
import java.util.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.*;
import edu.northwestern.at.utils.logger.*;

/**	Abstract Spelling Standardizer.
 */

abstract public class AbstractSpellingStandardizer
	extends IsCloseableObject
	implements SpellingStandardizer, UsesLogger
{
	/**	The map with alternate spellings as keys and standard spellings
	 *	as values.
	 *
	 *	<p>
	 *	Initially null.  It is assigned by setMappedSpellings, or created
	 *	on demand by loadAlternativeSpellings( Reader , String ).  While
	 *	it is null, addMappedSpelling and addCachedSpelling do nothing.
	 *	</p>
	 */

	protected TaggedStrings mappedSpellings	= null;

	/**	The set of standard spellings.
	 *
	 *	<p>
	 *	addStandardSpelling stores each spelling it accepts both as given
	 *	and in lower case.
	 *	</p>
	 */

	protected Set<String> standardSpellingSet	=
		SetFactory.createNewSet();

	/**	Alternate spellings disambiguated by word class.
	 *
	 *	<p>
	 *	Stored in a Map2D obtained from Map2DFactory.
	 *	The compound key consists of the word class and alternate spelling,
	 *	and the value is the standardized spelling.
	 *	Assigned by loadAlternativeSpellingsByWordClass.
	 *	</p>
	 */

	protected Map2D<String, String, String> spellingsByWordClass;

	/**	Word classes of alternate spellings.
	 *
	 *	<p>
	 *	Assigned by loadAlternativeSpellingsByWordClass, which records
	 *	the name of every word class header it reads.
	 *	</p>
	 */

	protected Set<String> alternateSpellingsWordClasses;

	/**	Resource name of the default spellings by word class list.
	 *
	 *	<p>
	 *	Resolved with getClass().getResource() in the constructor.
	 *	</p>
	 */

	protected static String defaultSpellingsByWordClassFileName =
		"resources/spellingsbywordclass.txt";

	/**	Logger used for output.  Defaults to a DummyLogger. */

	protected Logger logger	= new DummyLogger();

	/**	Lexicon associated with this standardizer.  May be null. */

	protected Lexicon lexicon;

	/**	Create abstract spelling standardizer.
	 *
	 *	<p>
	 *	Tries to load the default spellings by word class from the class
	 *	resource named by defaultSpellingsByWordClassFileName, read as
	 *	utf-8.  Loading is best-effort: a missing resource, or any
	 *	failure while reading it, does not prevent construction.
	 *	Whatever the outcome, the word class map and the word class set
	 *	are non-null once the constructor returns, so later lookups by
	 *	word class do not fail on a null map.
	 *	</p>
	 *
	 *	<p>
	 *	Note that loadAlternativeSpellingsByWordClass is public and can
	 *	be overridden; an override is invoked from here, before the
	 *	subclass constructor body has run.
	 *	</p>
	 */

	public AbstractSpellingStandardizer()
	{
								//	Load default spellings by word class.
		try
		{
			URL defaultSpellingsURL	=
				this.getClass().getResource
				(
					defaultSpellingsByWordClassFileName
				);

								//	getResource returns null when the
								//	resource is missing.  Skip loading
								//	instead of relying on an exception.

			if ( defaultSpellingsURL != null )
			{
				loadAlternativeSpellingsByWordClass
				(
					defaultSpellingsURL ,
					"utf-8"
				);
			}
		}
		catch ( Exception ignored )
		{
								//	Deliberately ignored:  the default
								//	list is optional, and the standardizer
								//	remains usable without it.
		}
								//	Make sure the by-word-class
								//	collections exist even when nothing
								//	could be loaded.

		if ( spellingsByWordClass == null )
		{
			spellingsByWordClass	= Map2DFactory.createNewMap2D();
		}

		if ( alternateSpellingsWordClasses == null )
		{
			alternateSpellingsWordClasses	= new TreeSet<String>();
		}
	}

	/**	Load alternate to standard spellings by word class.
	 *
	 *	<p>
	 *	Each line is split into tokens with StringUtils.makeTokenArray.
	 *	Lines yielding no tokens are skipped.  A line whose first token
	 *	ends with a colon starts a new word class, named by that token
	 *	without the colon.  Any other line defines an entry for the
	 *	current word class:  the first token is the alternate spelling
	 *	and the second token is the standard spelling.  When a line has
	 *	only one token, that token is used for both.
	 *	</p>
	 *
	 *	<p>
	 *	When an alternate spelling contains "^", a mapped spelling is
	 *	also added through addMappedSpelling, using the alternate
	 *	spelling with "^" replaced by
	 *	CharUtils.CHAR_SUP_TEXT_MARKER_STRING.  addMappedSpelling does
	 *	nothing while no spelling map has been set.
	 *	</p>
	 *
	 *	<p>
	 *	Once the input has been opened, any previously loaded spellings
	 *	by word class are discarded and replaced.  The input is closed
	 *	before this method returns, whether or not reading succeeded.
	 *	</p>
	 *
	 *	@param	spellingsURL	URL of alternative spellings by word class.
	 *							If null, nothing is loaded and the
	 *							current spellings are left untouched.
	 *	@param	encoding		Text encoding (utf-8, 8859_1, etc.).
	 *
	 *	@throws	IOException		If the input cannot be opened or read.
	 */

	public void loadAlternativeSpellingsByWordClass
	(
		URL spellingsURL ,
	 	String encoding
	)
		throws IOException
	{
								//	Nothing to load without a URL.
								//	The other URL based loaders in this
								//	class treat a null URL the same way.

		if ( spellingsURL == null )
		{
			return;
		}

		BufferedReader buffer =
			new BufferedReader
			(
				new UnicodeReader
				(
					spellingsURL.openStream() ,
					encoding
				)
			);

		try
		{
								//	Entries read before the first word
								//	class header are filed under the
								//	empty word class.

			String wordClass	= "";

			spellingsByWordClass			= Map2DFactory.createNewMap2D();
			alternateSpellingsWordClasses	= new TreeSet<String>();

			String line;

			while ( ( line = buffer.readLine() ) != null )
			{
				String[] tokens	= StringUtils.makeTokenArray( line );

				if ( tokens.length == 0 )
				{
					continue;
				}

				String firstToken	= tokens[ 0 ];

				if ( firstToken.endsWith( ":" ) )
				{
								//	Word class header.

					wordClass	=
						firstToken.substring
						(
							0 ,
							firstToken.length() - 1
						);

					alternateSpellingsWordClasses.add( wordClass );
				}
				else
				{
								//	Spelling entry.  A lone token maps
								//	to itself.

					String spelling	=
						( tokens.length > 1 ) ? tokens[ 1 ] : firstToken;

					spellingsByWordClass.put(
						wordClass , firstToken , spelling );

					if ( firstToken.indexOf( "^" ) >= 0 )
					{
						addMappedSpelling
						(
							StringUtils.replaceAll
							(
								firstToken ,
								"^" ,
								CharUtils.CHAR_SUP_TEXT_MARKER_STRING
							) ,
							spelling
						);
					}
				}
			}
		}
		finally
		{
								//	Release the stream even when reading
								//	or parsing fails.
			buffer.close();
		}
	}

	/**	Loads alternate spellings from a URL.
	 *
	 *	<p>
	 *	Opens the URL with the given encoding and hands the resulting
	 *	reader to loadAlternativeSpellings( Reader , String ).
	 *	A null URL is ignored.
	 *	</p>
	 *
	 *	@param	url			URL containing alternate spellings to
	 *						standard spellings mappings.  May be null.
	 *	@param	encoding	Text encoding (utf-8, 8859_1, etc.).
	 *	@param	delimChars	Delimiter characters separating spelling pairs.
	 *
	 *	@throws	IOException	If the URL cannot be opened or read.
	 */

	public void loadAlternativeSpellings
	(
		URL url ,
		String encoding ,
		String delimChars
	)
		throws IOException
	{
								//	No URL, nothing to load.
		if ( url == null )
		{
			return;
		}

		UnicodeReader spellingsReader	=
			new UnicodeReader( url.openStream() , encoding );

		loadAlternativeSpellings( spellingsReader , delimChars );
	}

	/**	Loads alternative spellings from a reader.
	 *
	 *	<p>
	 *	Each input line is split with String.split( delimChars ).
	 *	Lines producing fewer than two fields are skipped.  Otherwise
	 *	the first field (trimmed) is the alternate spelling and the
	 *	second field (trimmed) is the standard spelling; the pair is
	 *	added with addMappedSpelling.  When the alternate spelling
	 *	contains "^", a second mapping is added using the alternate
	 *	spelling with "^" replaced by
	 *	CharUtils.CHAR_SUP_TEXT_MARKER_STRING.
	 *	</p>
	 *
	 *	<p>
	 *	A new spelling map is created when none has been set yet.
	 *	The reader is closed before this method returns, whether or not
	 *	reading succeeded.
	 *	</p>
	 *
	 *	@param	reader		The reader.
	 *	@param	delimChars	Delimiter separating spelling pairs.  It is
	 *						passed to String.split and is therefore
	 *						interpreted as a regular expression.
	 *
	 *	@throws	IOException	If reading fails.
	 */

	public void loadAlternativeSpellings
	(
		Reader reader ,
		String delimChars
	)
		throws IOException
	{
		BufferedReader bufferedReader	= new BufferedReader( reader );

		try
		{
			if ( mappedSpellings == null )
			{
				mappedSpellings		= new TernaryTrie();
			}

			String inputLine	= bufferedReader.readLine();

			while ( inputLine != null )
			{
				String[] tokens	= inputLine.split( delimChars );

				if ( tokens.length > 1 )
				{
					String alternateSpelling	= tokens[ 0 ].trim();
					String standardSpelling		= tokens[ 1 ].trim();

					addMappedSpelling(
						alternateSpelling , standardSpelling );

					if ( alternateSpelling.indexOf( "^" ) >= 0 )
					{
						addMappedSpelling
						(
							StringUtils.replaceAll
							(
								alternateSpelling ,
								"^" ,
								CharUtils.CHAR_SUP_TEXT_MARKER_STRING
							) ,
							standardSpelling
						);
					}
				}

				inputLine	= bufferedReader.readLine();
			}
		}
		finally
		{
								//	Release the reader even when reading
								//	or parsing fails.
			bufferedReader.close();
		}
	}

	/**	Loads standard spellings from a URL.
	 *
	 *	<p>
	 *	Opens the URL with the given encoding and hands the resulting
	 *	reader to loadStandardSpellings( Reader ).  A null URL is ignored.
	 *	</p>
	 *
	 *	@param	url			URL containing standard spellings.  May be null.
	 *	@param	encoding	Character set encoding for spellings.
	 *
	 *	@throws	IOException	If the URL cannot be opened or read.
	 */

	public void loadStandardSpellings
	(
		URL url ,
		String encoding
	)
		throws IOException
	{
								//	No URL, nothing to load.
		if ( url == null )
		{
			return;
		}

		UnicodeReader spellingsReader	=
			new UnicodeReader( url.openStream() , encoding );

		loadStandardSpellings( spellingsReader );
	}

	/**	Loads standard spellings from a reader.
	 *
	 *	<p>
	 *	Every input line is trimmed and passed to addStandardSpelling,
	 *	which ignores empty spellings.  The reader is closed before this
	 *	method returns, whether or not reading succeeded.
	 *	</p>
	 *
	 *	@param	reader		The reader.
	 *
	 *	@throws	IOException	If reading fails.
	 */

	public void loadStandardSpellings
	(
		Reader reader
	)
		throws IOException
	{
		BufferedReader bufferedReader	= new BufferedReader( reader );

		try
		{
			String spelling	= bufferedReader.readLine();

			while ( spelling != null )
			{
				addStandardSpelling( spelling.trim() );

				spelling	= bufferedReader.readLine();
			}
		}
		finally
		{
								//	Release the reader even when reading
								//	fails part way through.
			bufferedReader.close();
		}
	}

	/**	Add a mapped spelling.
	 *
	 *	<p>
	 *	The mapping is recorded under the alternate spelling as given and
	 *	under its lower case form, and the standard spelling is added to
	 *	the standard spellings.  Nothing happens when no spelling map is
	 *	set, or when either spelling is null or empty.
	 *	</p>
	 *
	 *	@param	alternateSpelling	The alternate spelling.
	 *	@param	standardSpelling	The corresponding standard spelling.
	 */

	public void addMappedSpelling
	(
		String alternateSpelling ,
		String standardSpelling
	)
	{
								//	No spelling map, nowhere to store.
		if ( mappedSpellings == null )
		{
			return;
		}
								//	Both spellings must be present.

		if	(	( standardSpelling == null ) ||
				( standardSpelling.length() == 0 ) ||
				( alternateSpelling == null ) ||
				( alternateSpelling.length() == 0 )
			)
		{
			return;
		}

		String lowerCaseAlternate	= alternateSpelling.toLowerCase();

		mappedSpellings.putTag( alternateSpelling , standardSpelling );
		mappedSpellings.putTag( lowerCaseAlternate , standardSpelling );

		addStandardSpelling( standardSpelling );
	}

	/**	Add a standard spelling.
	 *
	 *	<p>
	 *	The spelling is stored as given and in lower case.
	 *	Null and empty spellings are ignored.
	 *	</p>
	 *
	 *	@param	standardSpelling	A standard spelling.
	 */

	public void addStandardSpelling
	(
		String standardSpelling
	)
	{
								//	Ignore missing or empty spellings.

		if	(	( standardSpelling == null ) ||
				( standardSpelling.length() == 0 )
			)
		{
			return;
		}

		standardSpellingSet.add( standardSpelling );
		standardSpellingSet.add( standardSpelling.toLowerCase() );
	}

	/**	Add standard spellings from a collection.
	 *
	 *	<p>
	 *	Each element is passed to addStandardSpelling in iteration order.
	 *	</p>
	 *
	 *	@param	standardSpellings	A collection of standard spellings.
	 */

	public void addStandardSpellings
	(
		Collection<String> standardSpellings
	)
	{
		for ( String standardSpelling : standardSpellings )
		{
			addStandardSpelling( standardSpelling );
		}
	}

	/**	Cache a generated mapped spelling.
	 *
	 *	<p>
	 *	The mapping is recorded under the alternate spelling as given and
	 *	under its lower case form.  Unlike addMappedSpelling, the
	 *	standard spelling is not added to the standard spellings.
	 *	Nothing happens when no spelling map is set, or when either
	 *	spelling is null or empty.
	 *	</p>
	 *
	 *	@param	alternateSpelling	The alternate spelling.
	 *	@param	standardSpelling	The corresponding standard spelling.
	 */

	public void addCachedSpelling
	(
		String alternateSpelling ,
		String standardSpelling
	)
	{
								//	No spelling map, nowhere to cache.
		if ( mappedSpellings == null )
		{
			return;
		}
								//	Both spellings must be present.

		if	(	( standardSpelling == null ) ||
				( standardSpelling.length() == 0 ) ||
				( alternateSpelling == null ) ||
				( alternateSpelling.length() == 0 )
			)
		{
			return;
		}

		String lowerCaseAlternate	= alternateSpelling.toLowerCase();

		mappedSpellings.putTag( alternateSpelling , standardSpelling );
		mappedSpellings.putTag( lowerCaseAlternate , standardSpelling );
	}

	/**	Sets the map from alternate spellings to standard spellings.
	 *
	 *	<p>
	 *	The given map replaces the current one; it is stored as is,
	 *	not copied.
	 *	</p>
	 *
	 *	@param	mappedSpellings		Map with alternate spellings as keys
	 *							and standard spellings as values.
	 */

	public void setMappedSpellings
	(
		TaggedStrings mappedSpellings
	)
	{
		this.mappedSpellings	= mappedSpellings;
	}

	/**	Sets the standard spellings.
	 *
	 *	<p>
	 *	The given set replaces the current one; it is stored as is,
	 *	not copied.
	 *	</p>
	 *
	 *	@param	standardSpellings		Set of standard spellings.
	 */

	public void setStandardSpellings
	(
		Set<String> standardSpellings
	)
	{
		this.standardSpellingSet	= standardSpellings;
	}

	/**	Returns standard spellings given a spelling.
	 *
	 *	<p>
	 *	When a spelling map is defined, the first of the following keys
	 *	found in the map supplies the standard spelling:  the spelling
	 *	as given; its lower case form; and, only when the spelling
	 *	contains a dash, the spelling with dashes removed followed by
	 *	the lower case form of that.  When no key is found, or no
	 *	spelling map is defined, the original spelling is used.
	 *	The chosen spelling is then passed through fixCapitalization.
	 *	</p>
	 *
	 *	@param	spelling	The spelling.
	 *
	 *	@return				The standard spellings as an array of String.
	 *						This implementation always returns exactly
	 *						one entry.
	 */

	public String[] standardizeSpelling( String spelling )
	{
		final String lowered	= spelling.toLowerCase();

								//	Fall back to the original spelling
								//	when no mapping applies.

		String standard			= spelling;

		if ( mappedSpellings != null )
		{
								//	Exact match in the spelling map.

			if ( mappedSpellings.containsString( spelling ) )
			{
				standard	= mappedSpellings.getTag( spelling );
			}
								//	Lower case match in the spelling map.

			else if ( mappedSpellings.containsString( lowered ) )
			{
				standard	= mappedSpellings.getTag( lowered );
			}
								//	Dashed spelling:  retry without the
								//	dashes, as given and then lower cased.

			else if ( CharUtils.hasDash( spelling ) )
			{
				final String dashless	= CharUtils.evictDashes( spelling );

				if ( mappedSpellings.containsString( dashless ) )
				{
					standard	= mappedSpellings.getTag( dashless );
				}
				else
				{
					final String dashlessLowered	= dashless.toLowerCase();

					if ( mappedSpellings.containsString( dashlessLowered ) )
					{
						standard	=
							mappedSpellings.getTag( dashlessLowered );
					}
				}
			}
		}

		return new String[]{ fixCapitalization( spelling , standard ) };
	}

	/**	Returns a standard spelling given a standard or alternate spelling.
	 *
	 *	<p>
	 *	The spellings by word class are consulted first, using the
	 *	spelling as given and then its lower case form.  If they hold no
	 *	entry, or none have been loaded, the last suggestion returned by
	 *	standardizeSpelling( spelling ) is used.  If that also yields
	 *	nothing, the original spelling is returned.
	 *	</p>
	 *
	 *	@param	spelling	The spelling.
	 *	@param	wordClass	The major word class.
	 *
	 *	@return				The standard spelling.
	 */

	public String standardizeSpelling( String spelling , String wordClass )
	{
								//	Get lowercase form of spelling.

		String lcSpelling	= spelling.toLowerCase();

		String result		= null;

								//	See if we have a standard spelling
								//	defined for this word class.  Try
								//	original case first, then lower case.
								//	The map is null when no spellings by
								//	word class were ever loaded.

		if ( spellingsByWordClass != null )
		{
			result	=
				(String)spellingsByWordClass.get( wordClass , spelling );

			if ( result == null )
			{
				result	=
					(String)spellingsByWordClass.get(
						wordClass , lcSpelling );
			}
		}
								//	If not, get a list of suggested
								//	standard spellings without regard
								//	to word class.
		if ( result == null )
		{
			String[] suggestions	= standardizeSpelling( spelling );

								//	If we got any suggested spellings,
								//	choose the last (i.e., best).
								//	An overriding implementation may
								//	return null or an empty array.

			if ( ( suggestions != null ) && ( suggestions.length > 0 ) )
			{
				result	= suggestions[ suggestions.length - 1 ];
			}
		}
								//	No standard spelling found so far?
								//	Return the original spelling.
		if ( result == null )
		{
			result	= spelling;
		}

		return result;
	}

	 /** Returns number of alternate spellings.
	  *
	  *	@return		The string count reported by the spelling map,
	  *				or 0 when no spelling map is set.
	  */

	public int getNumberOfAlternateSpellings()
	{
		return
			( mappedSpellings == null ) ?
				0 : mappedSpellings.getStringCount();
	}

	 /** Returns number of alternate spellings by word class.
	  *
	  *	<p>
	  *	A count is 0 when the corresponding collection has not been
	  *	created.
	  *	</p>
	  *
	  *	@return		int array with two entries.
	  *				[0]	=	The number of alternate spellings word classes.
	  *				[1]	=	The size reported by the spellings by
	  *						word class map.
	  */

	public int[] getNumberOfAlternateSpellingsByWordClass()
	{
		int wordClassCount	=
			( alternateSpellingsWordClasses == null ) ?
				0 : alternateSpellingsWordClasses.size();

		int spellingCount	=
			( spellingsByWordClass == null ) ?
				0 : spellingsByWordClass.size();

		return new int[]{ wordClassCount , spellingCount };
	}

	 /** Returns number of standard spellings.
	  *
	  *	@return		The size of the standard spellings set,
	  *				or 0 when that set is null.
	  */

	public int getNumberOfStandardSpellings()
	{
		return
			( standardSpellingSet == null ) ?
				0 : standardSpellingSet.size();
	}

	/**	Return the mapped spellings.
	 *
	 *	@return		The tagged strings holding (alternate spelling,
	 *				standard spelling) pairs.  This is the internal
	 *				object, not a copy.  May be null if this
	 *				standardizer does not use such a map.
	 */

	public TaggedStrings getMappedSpellings()
	{
		return this.mappedSpellings;
	}

	/**	Return the standard spellings.
	 *
	 *	@return		The standard spellings as a Set.  This is the
	 *				internal set, not a copy.  May be null.
	 */

	public Set<String> getStandardSpellings()
	{
		return this.standardSpellingSet;
	}

	/**	Preprocess spelling.
	 *
	 *	<p>
	 *	This implementation applies no preprocessing and returns the
	 *	argument as is.
	 *	</p>
	 *
	 *	@param	spelling	Spelling to preprocess.
	 *
	 *	@return				Preprocessed spelling; here, the same
	 *						spelling that was passed in.
	 */

	public String preprocessSpelling
	(
		String spelling
	)
	{
		final String preprocessed	= spelling;

		return preprocessed;
	}

	/**	Fix capitalization of standardized spelling.
	 *
	 *	<p>
	 *	Delegates to CharUtils.makeCaseMatch, passing the candidate
	 *	standard spelling first and the original spelling second.
	 *	</p>
	 *
	 *	@param	spelling			The original spelling.
	 *	@param	standardSpelling	The candidate standard spelling.
	 *
	 *	@return						The value produced by
	 *								CharUtils.makeCaseMatch; presumably the
	 *								standard spelling with capitalization
	 *								matching the original spelling
	 *								(confirm against CharUtils).
	 */

	public String fixCapitalization
	(
		String spelling ,
		String standardSpelling
	)
	{
		final String caseMatched	=
			CharUtils.makeCaseMatch( standardSpelling , spelling );

		return caseMatched;
	}

	/**	Get the logger.
	 *
	 *	@return		The logger currently used for output.
	 */

	public Logger getLogger()
	{
		return this.logger;
	}

	/**	Set the logger.
	 *
	 *	@param	logger		The logger to use for output from now on.
	 */

	public void setLogger
	(
		Logger logger
	)
	{
		this.logger	= logger;
	}

	/**	Get the word lexicon.
	 *
	 *	@return		The lexicon associated with this standardizer.
	 *				May be null.
	 */

	public Lexicon getLexicon()
	{
		return this.lexicon;
	}

	/**	Set the lexicon.
	 *
	 *	@param	lexicon		Lexicon to associate with this standardizer.
	 */

	public void setLexicon
	(
		Lexicon lexicon
	)
	{
		this.lexicon	= lexicon;
	}
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimers in the documentation and/or other materials provided
      with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors may be
      used to endorse or promote products derived from this Software
      without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/