/*
 * DefaultNameRecognizer.java example
 *
 * Explorer
 * morphadorner-opensource-master
 * - src
 */
package edu.northwestern.at.utils.corpuslinguistics.namerecognizer;

/*	Please see the license information at the end of this file. */

import java.io.*;
import java.net.*;
import java.util.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.adornedword.*;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.*;
import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.*;
import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;
import edu.northwestern.at.utils.logger.*;

import edu.northwestern.at.morphadorner.tools.*;

/**	DefaultNameRecognizer extracts proper names from text.
 *
 *	<p>
 *	A candidate name is a maximal run of adjacent words within one
 *	sentence in which every word is either tagged as a proper noun, or
 *	is tagged as a noun and starts with a capital letter.  A candidate
 *	is kept only when {@link #validateNamePosition} accepts it.  A kept
 *	candidate is reported as a place when <code>names.isPlaceName</code>
 *	recognizes its text, and as a proper name otherwise.
 *	</p>
 */

public class DefaultNameRecognizer
	extends AbstractNameRecognizer
	implements NameRecognizer, UsesLogger
{
	/**	Failure raised while creating the default part of speech tags
	 *	in the constructor, or null if none occurred.
	 */

	private Exception partOfSpeechTagsFailure	= null;

	/**	Create default name recognizer.
	 *
	 *	<p>
	 *	Tries to create the default part of speech tags.  Construction
	 *	never fails: if the tags cannot be created the failure is
	 *	remembered and reported by {@link #findNamePositions} the first
	 *	time the tags are actually needed.
	 *	</p>
	 */

	public DefaultNameRecognizer()
	{
								//	Get part of speech tags.
		try
		{
			partOfSpeechTags	= new DefaultPartOfSpeechTags();
		}
		catch ( Exception e )
		{
								//	Stay best-effort here so that
								//	construction does not throw, but
								//	keep the cause instead of
								//	discarding it.

			partOfSpeechTagsFailure	= e;
		}
	}

	/**	Returns names from text.
	 *
	 *	<p>
	 *	The text is split into sentences, the sentences are part of
	 *	speech tagged, and the tagged sentences are passed to
	 *	{@link #findNames(List)}.
	 *	</p>
	 *
	 *	@param	text	The text from which to extract names.
	 *
	 *	@return			Array of Set of names and places as strings.
	 *						[0]	= Set of proper names.
	 *						[1]	= Set of places.
	 */

	public Set<String>[] findNames( String text )
	{
								//	Make sure part of speech tagger
								//	is defined.
								//
								//	NOTE(review): passing null presumably
								//	makes the setter install a default
								//	tagger -- confirm in
								//	AbstractNameRecognizer.

		if ( partOfSpeechTagger == null )
		{
			setPartOfSpeechTagger( null );
		}
								//	Extract sentences from text.
								//	Names are not allowed to cross
								//	sentence boundaries.

		List<List<String>> sentences	=
			sentenceSplitter.extractSentences( text , wordTokenizer );

								//	Get part of speech tags for each
								//	word in the text.

		List<List<AdornedWord>> taggedSentences	=
			partOfSpeechTagger.tagSentences( sentences );

								//	Get names in tagged sentences.

		return findNames( taggedSentences );
	}

	/**	Returns names from list of adorned word sentences.
	 *
	 *	@param	sentences	The list of sentences of adorned words
	 *						from which to extract names.
	 *
	 *	@return			Array of Set of names and places.
	 *					[0]	= Set of proper names.
	 *					[1]	= Set of places.
	 */

	public <T extends AdornedWord> Set<String>[] findNames
	(
	 	List<List<T>> sentences
	)
	{
								//	Get name positions:
								//	[0] = proper names, [1] = places.

		List<NamePosition>[] positions	= findNamePositions( sentences );

								//	Holds names and places extracted
								//	from text.

		Set<String> namesSet	= SetFactory.createNewSet();
		Set<String> placesSet	= SetFactory.createNewSet();

								//	Convert name positions to names.

		for ( NamePosition namePosition : positions[ 0 ] )
		{
			namesSet.add( namePositionToName( sentences , namePosition ) );
		}
								//	Convert place positions to place names.

		for ( NamePosition placePosition : positions[ 1 ] )
		{
			placesSet.add( namePositionToName( sentences , placePosition ) );
		}
								//	Return name and place sets.

		@SuppressWarnings("unchecked")
		Set<String>[] result	= (Set<String>[])new Set[ 2 ];
		result[ 0 ]				= namesSet;
		result[ 1 ]				= placesSet;

		return result;
	}

	/**	Returns name positions in list of adorned word sentences.
	 *
	 *	@param	sentences	The list of sentences of adorned words
	 *						from which to extract names.
	 *
	 *	@return			List of name positions of names and places.
	 *					[0]	= Positions of proper names.
	 *					[1]	= Position of places.
	 *
	 *	@throws	IllegalStateException
	 *					If a word must be classified but no part of
	 *					speech tags are available.
	 */

	public <T extends AdornedWord> List<NamePosition>[] findNamePositions
	(
		List<List<T>> sentences
	)
	{
								//	Holds lists of name positions
								//	extracted from text.

		List<NamePosition> namePositions	= ListFactory.createNewList();
		List<NamePosition> placePositions	= ListFactory.createNewList();

								//	Scan each tagged sentence for
								//	names.

		for ( int j = 0 ; j < sentences.size() ; j++ )
		{
			List<T> sentence	= sentences.get( j );

								//	State of the noun phrase currently
								//	being collected.  startingWord is
								//	-1 while no phrase is open.

			int properNounCount	= 0;
			int startingWord	= -1;
			int endingWord		= -1;

			for ( int k = 0 ; k < sentence.size() ; k++ )
			{
				AdornedWord word	= sentence.get( k );

				String spelling		= word.getSpelling();
				String posTag		= word.getPartsOfSpeech();

								//	Checked only once a word actually
								//	needs classifying, so input without
								//	any words still yields empty lists.

				checkPartOfSpeechTagsAvailable();

								//	A word belongs to a noun phrase if
								//	it is a proper noun, or a noun that
								//	starts with a capital letter.

				boolean isProperNoun	=
					partOfSpeechTags.isProperNounTag( posTag );

				boolean isNameWord		=
					isProperNoun ||
					(	partOfSpeechTags.isNounTag( posTag ) &&
						CharUtils.isFirstLetterCapital( spelling )
					);

				if ( isNameWord )
				{
					if ( startingWord == -1 )
					{
						startingWord	= k;
					}

					endingWord	= k;

								//	Count the proper nouns in this
								//	noun phrase.

					if ( isProperNoun )
					{
						properNounCount++;
					}
				}
								//	The word isn't part of a name, so
								//	it ends any open noun phrase.

				else if ( startingWord != -1 )
				{
					recordNounPhrase
					(
						sentences ,
						new NamePosition
						(
							j ,
							startingWord ,
							endingWord ,
							properNounCount
						) ,
						namePositions ,
						placePositions
					);

					properNounCount	= 0;
					startingWord	= -1;
					endingWord		= -1;
				}
			}
								//	Finished sentence.  Record any
								//	noun phrase still open at the end
								//	of the sentence.

			if ( startingWord != -1 )
			{
				recordNounPhrase
				(
					sentences ,
					new NamePosition
					(
						j ,
						startingWord ,
						endingWord ,
						properNounCount
					) ,
					namePositions ,
					placePositions
				);
			}
		}
								//	Return name and place lists.

		@SuppressWarnings("unchecked")
		List<NamePosition>[] result	= (List<NamePosition>[])new List[ 2 ];
		result[ 0 ]					= namePositions;
		result[ 1 ]					= placePositions;

		return result;
	}

	/**	Validates a noun phrase and files it as a place or a name.
	 *
	 *	<p>
	 *	Nothing is added when {@link #validateNamePosition} rejects the
	 *	phrase.  Otherwise the phrase goes to the place list when
	 *	<code>names.isPlaceName</code> accepts its text, and to the
	 *	name list if not.
	 *	</p>
	 *
	 *	@param	sentences		The collection of sentences.
	 *	@param	namePosition	The noun phrase position; may be
	 *							adjusted by validation.
	 *	@param	namePositions	Receives positions of proper names.
	 *	@param	placePositions	Receives positions of places.
	 */

	private <T extends AdornedWord> void recordNounPhrase
	(
		List<List<T>> sentences ,
		NamePosition namePosition ,
		List<NamePosition> namePositions ,
		List<NamePosition> placePositions
	)
	{
								//	In order for the noun phrase to be
								//	a name, validation requires at least
								//	one proper noun among its words.

		if ( !validateNamePosition( sentences , namePosition ) )
		{
			return;
		}

		String name	= namePositionToName( sentences , namePosition );

		if ( names.isPlaceName( name ) )
		{
			placePositions.add( namePosition );
		}
		else
		{
			namePositions.add( namePosition );
		}
	}

	/**	Fails with a descriptive error when no part of speech tags exist.
	 *
	 *	@throws	IllegalStateException
	 *					If <code>partOfSpeechTags</code> is null.  The
	 *					failure recorded by the constructor, if any,
	 *					is attached as the cause.
	 */

	private void checkPartOfSpeechTagsAvailable()
	{
		if ( partOfSpeechTags == null )
		{
			throw new IllegalStateException
			(
				"No part of speech tags are available for name recognition." ,
				partOfSpeechTagsFailure
			);
		}
	}

	/**	Check name for validity.
	 *
	 *	<p>
	 *	When the phrase starts with the word "Will", has more than one
	 *	word, and its second word is a name prefix according to
	 *	<code>names.isNamePrefix</code>, the leading "Will" is dropped
	 *	from the phrase by updating <code>namePosition</code> in place.
	 *	</p>
	 *
	 *	@param	sentences		The collection of sentences.
	 *	@param	namePosition	The possibly updated name position.
	 *
	 *	@return					true if name is valid, that is, if the
	 *							proper noun count is still positive.
	 */

	public <T extends AdornedWord> boolean validateNamePosition
	(
		List<List<T>> sentences ,
		NamePosition namePosition
	)
	{
		List<T> sentence	= sentences.get( namePosition.sentence );

		String firstWord	=
			sentence.get( namePosition.startingWord ).toString();

								//	The second word is only looked up
								//	when the phrase has one, because
								//	&& stops at the first false test.

		if	(	firstWord.equals( "Will" ) &&
				( namePosition.endingWord > namePosition.startingWord ) &&
				names.isNamePrefix
				(
					sentence.get( namePosition.startingWord + 1 ).toString()
				)
			)
		{
								//	NOTE(review): the decrement assumes
								//	"Will" was counted as a proper noun
								//	when the phrase was built -- confirm
								//	for a "Will" tagged as a plain noun.

			namePosition.startingWord++;
			namePosition.properNounCount--;
		}

		return ( namePosition.properNounCount > 0 );
	}
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimers in the documentation and/or other materials provided
      with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors may be
      used to endorse or promote products derived from this Software
      without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/