/*	Names.java example
 *
 *	Explorer
 *	morphadorner-opensource-master
 *	- src
 */
package edu.northwestern.at.utils.corpuslinguistics.namerecognizer;

/*	Please see the license information at the end of this file. */

import java.util.*;
import java.io.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;

/**	Extract person and place names from text.
 *
 *	<p>
 *	Uses lists of first names, surnames, and geographic locations
 *	to extract person names and locations from text.
 *	</p>
 *
 *	<p>
 *	Based in part on code written by Mark Watson.
 *	</p>
 */

public class Names
{
	/**	Default path to the name resource data files.
	 *
	 *	<p>
	 *	The no-argument constructor passes this value as the resource
	 *	path.  Resource file names are appended to the path and the
	 *	result is resolved with <code>Names.class.getResource</code>.
	 *	</p>
	 */

	protected static String defaultResourcePath = "resources/";

	/**	Surname set, loaded from <code>lastnames.txt</code>.
	 *
	 *	<p>
	 *	Null until the name data has been loaded.  The constructor
	 *	treats a non-null value here as meaning the shared name data
	 *	is already loaded.
	 *	</p>
	 */

	protected static Set<String> surnameSet = null;

	/**	First name set, loaded from <code>firstnames.txt</code>.
	 *	Null until the name data has been loaded.
	 */

	protected static Set<String> firstNameSet = null;

	/**	Place name map, loaded from <code>placenames.txt</code>.
	 *
	 *	<p>
	 *	Keys are place names.  The value stored for a place name is
	 *	what <code>getPlaceNameType</code> returns as the place name type.
	 *	Null until the name data has been loaded.
	 *	</p>
	 */

	protected static Map<String, String> placeNameMap = null;

	/**	Prefix title set (Mr., Mrs., etc.), loaded from
	 *	<code>prefixes.txt</code>.
	 *	Null until the name data has been loaded.
	 */

	protected static Set<String> prefixSet = null;

	/**	Name connectors set, loaded from <code>connectors.txt</code>.
	 *
	 *	<p>
	 *	Connectors are the words accepted between a first name and a
	 *	surname when checking for a person name.
	 *	Null until the name data has been loaded.
	 *	</p>
	 */

	protected static Set<String> connectorsSet = null;

	/**	Create name extractor. */

	public Names()
	{
		this( defaultResourcePath );
	}

	/**	Create name extractor.
	 *
	 *	@param	resourcePath	Path to resource files.
	 */

	public Names( String resourcePath )
	{
								//	Load static name data if not
								//	already loaded.

		if ( surnameSet != null ) return;

		try
		{
			surnameSet =
				SetUtils.loadSet
				(
					Names.class.getResource
					(
						resourcePath + "lastnames.txt"
					),
					"utf-8"
				);

			firstNameSet =
				SetUtils.loadSet
				(
					Names.class.getResource
					(
						resourcePath + "firstnames.txt"
					),
					"utf-8"
				);

			placeNameMap =
				MapUtils.loadMap
				(
					Names.class.getResource
					(
						resourcePath + "placenames.txt"
					),
					"\t",
					"",
					"utf-8"
				);

			prefixSet =
				SetUtils.loadSet
				(
					Names.class.getResource
					(
						resourcePath + "prefixes.txt"
					),
					"utf-8"
				);

			connectorsSet =
				SetUtils.loadSet
				(
					Names.class.getResource
					(
						resourcePath + "connectors.txt"
					),
					"utf-8"
				);
		}
		catch ( Exception e )
		{
//			e.printStackTrace();
		}
	}

	/**	Check whether a string is a known name or place.
	 *
	 *	<p>
	 *	The string is looked up, in order, in the first name set,
	 *	the surname set, and the keys of the place name map.
	 *	</p>
	 *
	 *	@param	s	The string to check.
	 *
	 *	@return		true if the string is a first name, a surname,
	 *				or a place name.
	 */

	public boolean isNameOrPlace( String s )
	{
		if ( firstNameSet.contains( s ) )
		{
			return true;
		}

		if ( surnameSet.contains( s ) )
		{
			return true;
		}

		return placeNameMap.containsKey( s );
	}

	/**	Decide whether a candidate name should be accepted.
	 *
	 *	<p>
	 *	An empty name is never accepted.  A single word name which starts
	 *	a sentence is rejected when its trimmed lower case form appears
	 *	in the lexicon.  Every other non-empty name is accepted.
	 *	</p>
	 *
	 *	@param	lexicon		Word lexicon.  May be null, in which case
	 *						no lexicon check is made.
	 *	@param	name		The text of the name.
	 *	@param	firstWord	True if the name starts with the first word
	 *						in a sentence.
	 *	@param	numWords	The number of words in the name.
	 *
	 *	@return				true if the name should be accepted as such.
	 */

	protected boolean acceptName
	(
		Lexicon lexicon ,
		String name ,
		boolean firstWord ,
		int numWords
	)
	{
								//	Nothing to accept.

		if ( name.length() == 0 )
		{
			return false;
		}
								//	Only a lone sentence-initial word
								//	is checked against the lexicon,
								//	and only when there is a lexicon.

		boolean needsLexiconCheck	=
			firstWord && ( numWords == 1 ) && ( lexicon != null );

		if ( !needsLexiconCheck )
		{
			return true;
		}

		String lexiconKey	= name.toLowerCase().trim();

		return !lexicon.containsEntry( lexiconKey );
	}

	/**	Find the person and place names in an array of words.
	 *
	 *	<p>
	 *	At each word position the longest candidate is tried first.
	 *	Runs of five down to one word are tried as person names;  runs
	 *	of three words or fewer are tried as place names before they
	 *	are tried as person names.  The first candidate accepted by
	 *	{@link #acceptName} is recorded and scanning resumes with the
	 *	word following it.
	 *	</p>
	 *
	 *	@param	words		String array of words to search for names.
	 *						This should correspond to a single sentence.
	 *						May be null or empty.
	 *
	 *	@param	lexicon		Lexicon for filtering names.
	 *
	 *	@return				Two element array containing two sets.
	 *						First set is a list of person names.
	 *						Second set is a list of place names.
	 */

	public Set<String>[] getProperNames( String[] words , Lexicon lexicon )
	{
								//	Sets to hold the person names
								//	and the place names found.

		Set<String> people	= SetFactory.createNewSet();
		Set<String> places	= SetFactory.createNewSet();

								//	The result always holds these
								//	two sets, filled in or not.

		@SuppressWarnings("unchecked")
		Set<String>[] nameSets	= (Set<String>[])new Set[ 2 ];

		nameSets[ 0 ]	= people;
		nameSets[ 1 ]	= places;

								//	No words means no names.

		if ( ( words == null ) || ( words.length == 0 ) )
		{
			return nameSets;
		}

		int pos	= 0;

		while ( pos < words.length )
		{
			boolean sentenceStart	= ( pos == 0 );

								//	Number of words in the name found
								//	at this position, zero if none.

			int matched	= 0;

			for ( int len = 5 ; ( len > 0 ) && ( matched == 0 ) ; len-- )
			{
								//	Place names run to at most
								//	three words and are tried before
								//	person names of the same length.

				if ( len <= 3 )
				{
					String place	= getPlaceName( words , pos , len );

					if	(	( place.length() > 0 ) &&
							acceptName( lexicon , place , sentenceStart , len )
						)
					{
						places.add( place );
						matched	= len;
						continue;
					}
				}

				String person	= getPersonName( words , pos , len );

				if	(	( person.length() > 0 ) &&
						acceptName( lexicon , person , sentenceStart , len )
					)
				{
					people.add( person );
					matched	= len;
				}
			}
								//	Move past the name just found, or
								//	past one word when no name was found.

			pos	+= ( matched > 0 ) ? matched : 1;
		}

		return nameSets;
	}

	/**	Extract all proper names for people and places from a string.
	 *
	 *	<p>
	 *	The string is split into words with a
	 *	<code>DefaultWordTokenizer</code> and the words are searched
	 *	for names.
	 *	</p>
	 *
	 *	@param	s			String to search for names.
	 *						This should correspond to a single sentence.
	 *
	 *	@param	lexicon		Lexicon for filtering names.
	 *
	 *	@return				Two element array containing two sets.
	 *						First set is a list of person names.
	 *						Second set is a list of place names.
	 */

	public Set<String>[] getProperNames( String s , Lexicon lexicon )
	{
								//	Tokenize the input string
								//	into a list of words.

		WordTokenizer tokenizer	= new DefaultWordTokenizer();
		List<String> tokens		= tokenizer.extractWords( s );

								//	Copy the words into a string array
								//	of exactly the right size.

		String[] tokenArray	= new String[ tokens.size() ];

		tokenArray	= tokens.toArray( tokenArray );

								//	Search the words for names.

		return getProperNames( tokenArray , lexicon );
	}

	/**	Extract all proper names for people and places from a list of words.
	 *
	 *	<p>
	 *	A null list is treated like an empty one, matching the handling
	 *	of a null array by the string array form of this method.
	 *	</p>
	 *
	 *	@param	wordsList	List of words to search for names.
	 *						This should correspond to a single sentence.
	 *						May be null or empty.
	 *
	 *	@param	lexicon		Lexicon for filtering names.
	 *
	 *	@return				Two element array containing two sets.
	 *						First set is a list of person names.
	 *						Second set is a list of place names.
	 */

	public Set<String>[] getProperNames
	(
		List<String> wordsList ,
		Lexicon lexicon
	)
	{
								//	Convert list of words to
								//	string array of words.  A null
								//	list becomes a null array, for
								//	which empty name sets are returned.

		String[] words	= null;

		if ( wordsList != null )
		{
			words	= wordsList.toArray( new String[ wordsList.size() ] );
		}
								//	Get names from list of words.

		return getProperNames( words , lexicon );
	}

	/**	Get a place name from a list of words.
	 *
	 *	<p>
	 *	The <code>numWords</code> words starting at
	 *	<code>startIndex</code> are joined with single blanks and the
	 *	joined text is checked with {@link #isPlaceName}.
	 *	</p>
	 *
	 *	@param	words		String array of words.
	 *	@param	startIndex	Start index in words array to check for a name.
	 *	@param	numWords	The number of words to check for a name.
	 *
	 *	@return				The place name with leading and trailing
	 *						whitespace removed, if found, or an empty
	 *						string if not found or if fewer than
	 *						<code>numWords</code> words remain at
	 *						<code>startIndex</code>.
	 */

	public String getPlaceName
	(
		String[] words ,
		int startIndex ,
		int numWords
	)
	{
								//	If starting index plus the
								//	number of words to look at runs
								//	past the number of words, we can't
								//	extract a name of the specified
								//	length.

		if ( ( startIndex + numWords ) > words.length )
		{
			return "";
		}
								//	Concatenate words to form the
								//	potential place name.

		StringBuilder sb	= new StringBuilder();

		int endIndex	= startIndex + numWords - 1;

		for ( int i = startIndex ; i <= endIndex ; i++ )
		{
			if ( i > startIndex )
			{
				sb.append( " " );
			}
								//	Append each successive word,
								//	not the starting word repeatedly.

			sb.append( words[ i ] );
		}
								//	If the concatenated words
								//	form a place name, return that
								//	name, otherwise return an
								//	empty string.

		String candidate	= sb.toString();

		if ( !isPlaceName( candidate ) )
		{
			return "";
		}

		return candidate.trim();
	}

	/**	Look up the type of a place name.
	 *
	 *	@param	placeName	The place name.
	 *
	 *	@return				The value stored for the place name in the
	 *						place name map, with leading and trailing
	 *						whitespace removed, or an empty string if
	 *						there is none.
	 */

	public String getPlaceNameType( String placeName )
	{
		String placeType	= placeNameMap.get( placeName );

		return ( placeType == null ) ? "" : placeType.trim();
	}

	/**	Check if name is a place name.
	 *
	 *	@param	name	The name.
	 *
	 *	@return			true if the place name map holds a non-null
	 *					value for the name.
	 */

	public boolean isPlaceName( String name )
	{
								//	A place name is one for which the
								//	place name map yields a value.

		String placeType	= placeNameMap.get( name );

		return ( placeType != null );
	}

	/**	Check if word is a name prefix (Mr., Mrs., etc.).
	 *
	 *	@param	word	The word to check.  May be null.
	 *
	 *	@return			true if the word is non-null and appears
	 *					in the prefix title set.
	 */

	public boolean isNamePrefix( String word )
	{
								//	A null word is never a prefix.

		if ( word == null )
		{
			return false;
		}
								//	See if the word is in the set
								//	of name prefixes.

		return prefixSet.contains( word );
	}

	/**	Check if string is a person name.
	 *
	 *	<p>
	 *	The string is split into words with a
	 *	<code>DefaultWordTokenizer</code> and the words are checked
	 *	with {@link #isPersonName(String[])}.
	 *	</p>
	 *
	 *	@param	s	The string.
	 *
	 *	@return		true if input string is a person name.
	 */

	public boolean isPersonName( String s )
	{
								//	Tokenize the input string
								//	into a list of words.

		WordTokenizer tokenizer	= new DefaultWordTokenizer();
		List<String> tokens		= tokenizer.extractWords( s );

								//	Copy the words into a string array
								//	of exactly the right size.

		String[] tokenArray	= new String[ tokens.size() ];

		tokenArray	= tokens.toArray( tokenArray );

								//	Check whether the words form a name.

		return isPersonName( tokenArray );
	}

	/**	Get a person name from a list of words.
	 *
	 *	<p>
	 *	The <code>numWords</code> words starting at
	 *	<code>startIndex</code> are checked with
	 *	{@link #isPersonName(String[])}.  When they form a person name
	 *	they are joined with single blanks to produce the result.
	 *	</p>
	 *
	 *	@param	words		String array of words.
	 *	@param	startIndex	Start index in words array to check for a name.
	 *	@param	numWords	The number of words to check for a name.
	 *
	 *	@return				The person name with leading and trailing
	 *						whitespace removed, if found, or an empty
	 *						string if not found or if fewer than
	 *						<code>numWords</code> words remain at
	 *						<code>startIndex</code>.
	 */

	public String getPersonName
	(
		String[] words ,
		int startIndex ,
		int numWords
	)
	{
								//	If starting index plus the
								//	number of words to look at runs
								//	past the number of words, we can't
								//	extract a name of the specified
								//	length.

		if ( ( startIndex + numWords ) > words.length )
		{
			return "";
		}
								//	Copy the words forming a potential
								//	name to a new string array.

		String[] candidate	= new String[ numWords ];

		System.arraycopy( words , startIndex , candidate , 0 , numWords );

								//	Not a person name.

		if ( !isPersonName( candidate ) )
		{
			return "";
		}
								//	Create a name string from the words,
								//	placing a blank between words only.

		StringBuilder sb	= new StringBuilder();

		for ( int i = 0 ; i < candidate.length ; i++ )
		{
			if ( i > 0 )
			{
				sb.append( " " );
			}

			sb.append( candidate[ i ] );
		}

		return sb.toString().trim();
	}

	/**	Check if list of words form a person's name.
	 *
	 *	<p>
	 *	The words are matched against fixed patterns chosen by the
	 *	number of words.  In the patterns below "first" is an entry in
	 *	the first name set, "surname" an entry in the surname set,
	 *	"prefix" an entry in the prefix title set, "connector" an entry
	 *	in the name connectors set, "initial" a two character word
	 *	ending in a period, and "letter" a one character word.
	 *	</p>
	 *
	 *	<ul>
	 *	<li>One word:  first;  surname.</li>
	 *	<li>Two words:  first surname;  prefix surname.</li>
	 *	<li>Three words:  first first surname;  prefix first surname;
	 *		first connector surname;  first initial surname.</li>
	 *	<li>Four words:  first first first surname;
	 *		prefix first first surname;  prefix first letter surname;
	 *		first first initial surname;  prefix first initial surname;
	 *		first connector connector surname.</li>
	 *	<li>Five words:  prefix first connector connector surname.</li>
	 *	</ul>
	 *
	 *	<p>
	 *	Any other number of words is not a person name.
	 *	</p>
	 *
	 *	@param	words	The words.  Must not be null.
	 *
	 *	@return			true if words form a person's name.
	 */

	public boolean isPersonName( String[] words )
	{
								//	Assume words do not form a name.

		boolean result	= false;

								//	Perform different checks depending
								//	upon number of words.

		switch( words.length )
		{
			case 1:
								//	first  |  surname

				result	=
					firstNameSet.contains( words[ 0 ] ) ||
					surnameSet.contains( words[ 0 ] );

				break;

			case 2:
								//	first surname

				result	=
					firstNameSet.contains( words[ 0 ] ) &&
					surnameSet.contains( words[ 1 ] );

								//	prefix surname

				result	=
					result ||
					( prefixSet.contains( words[ 0 ] ) ) &&
					( surnameSet.contains( words[ 1 ] ) );

				break;

			case 3:
								//	first first surname

				result	=
					( firstNameSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( surnameSet.contains( words[ 2 ] ) );

								//	prefix first surname

				result	=
					result ||
					( prefixSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( surnameSet.contains( words[ 2 ] ) );

								//	first connector surname

				result	=
					result ||
					( firstNameSet.contains( words[ 0 ] ) ) &&
					( connectorsSet.contains( words[ 1 ] ) ) &&
					( surnameSet.contains( words[ 2 ] ) );

								//	first initial surname

				result	=
					result ||
					( firstNameSet.contains( words[ 0 ] ) ) &&
					( words[ 1 ].length() == 2 ) &&
					( words[ 1 ].endsWith( "." ) ) &&
					( surnameSet.contains( words[ 2 ] ) );

				break;

			case 4:
								//	first first first surname

				result	=
					( firstNameSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( firstNameSet.contains( words[ 2 ] ) ) &&
					( surnameSet.contains( words[ 3 ] ) );

								//	prefix first first surname

				result	=
					result ||
					( prefixSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( firstNameSet.contains( words[ 2 ] ) ) &&
					( surnameSet.contains( words[ 3 ] ) );

								//	prefix first letter surname

				result	=
					result ||
					( prefixSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( words[ 2 ].length() == 1 ) &&
					( surnameSet.contains( words[ 3 ] ) );

								//	first first initial surname.
								//	This must be or'ed with the result
								//	so far, like every other pattern,
								//	so the matches above are kept.

				result	=
					result ||
					( firstNameSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( words[ 2 ].length() == 2 ) &&
					( words[ 2 ].endsWith( "." ) ) &&
					( surnameSet.contains( words[ 3 ] ) );

								//	prefix first initial surname

				result	=
					result ||
					( prefixSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( words[ 2 ].length() == 2 ) &&
					( words[ 2 ].endsWith( "." ) ) &&
					( surnameSet.contains( words[ 3 ] ) );

								//	first connector connector surname

				result	=
					result ||
					( firstNameSet.contains( words[ 0 ] ) ) &&
					( connectorsSet.contains( words[ 1 ] ) ) &&
					( connectorsSet.contains( words[ 2 ] ) ) &&
					( surnameSet.contains( words[ 3 ] ) );

				break;

			case 5:
								//	prefix first connector connector
								//	surname

				result	=
					( prefixSet.contains( words[ 0 ] ) ) &&
					( firstNameSet.contains( words[ 1 ] ) ) &&
					( connectorsSet.contains( words[ 2 ] ) ) &&
					( connectorsSet.contains( words[ 3 ] ) ) &&
					( surnameSet.contains( words[ 4 ] ) );

				break;

			default:
								//	No person name pattern has
								//	this many words.
				break;
		}

		return result;
	}

	/**	Return first name set.
	 *
	 *	<p>
	 *	The set returned is the shared static set itself, not a copy.
	 *	</p>
	 *
	 *	@return		First name set, or null if the name data
	 *				has not been loaded.
	 */

	public Set<String> getFirstNames()
	{
		return firstNameSet;
	}

	/**	Return last name (surname) set.
	 *
	 *	<p>
	 *	The set returned is the shared static set itself, not a copy.
	 *	</p>
	 *
	 *	@return		Last name set, or null if the name data
	 *				has not been loaded.
	 */

	public Set<String> getSurnames()
	{
		return surnameSet;
	}

	/**	Return place name map.
	 *
	 *	<p>
	 *	The map returned is the shared static map itself, not a copy.
	 *	Its keys are the place names.
	 *	</p>
	 *
	 *	@return		Place name map, or null if the name data
	 *				has not been loaded.
	 */

	public Map<String, String> getPlaceNames()
	{
		return placeNameMap;
	}

	/**	Return prefix title set.
	 *
	 *	<p>
	 *	The set returned is the shared static set itself, not a copy.
	 *	</p>
	 *
	 *	@return		Prefix title set, or null if the name data
	 *				has not been loaded.
	 */

	public Set<String> getPrefixes()
	{
		return prefixSet;
	}

	/**	Return name connectors set.
	 *
	 *	<p>
	 *	The set returned is the shared static set itself, not a copy.
	 *	</p>
	 *
	 *	@return		Name connectors set, or null if the name data
	 *				has not been loaded.
	 */

	public Set<String> getConnectors()
	{
		return connectorsSet;
	}
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimers in the documentation and/or other materials provided
      with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors may be
      used to endorse or promote products derived from this Software
      without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/