AdornerInfo.java example

Explorer
morphadorner-opensource-master
- src
package edu.northwestern.at.morphadorner.servlets;

/*	Please see the license information at the end of this file. */

import java.io.*;
import java.net.*;
import java.util.*;

import javax.servlet.*;
import javax.servlet.http.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.html.*;
import edu.northwestern.at.utils.corpuslinguistics.languagerecognizer.*;
import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.*;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.*;
import edu.northwestern.at.utils.corpuslinguistics.namerecognizer.*;
import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.guesser.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.propernounretagger.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.transitionmatrix.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.trigram.*;
import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.*;
import edu.northwestern.at.utils.corpuslinguistics.spellingstandardizer.*;
import edu.northwestern.at.utils.corpuslinguistics.stemmer.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;

import edu.northwestern.at.utils.servlets.*;

/**	Adorner information for MorphAdorner example servlets.
 *
 *	<p>
 *	All the fields are public.
 *	</p>
 */

public class AdornerInfo
{
	/**	The sentence splitter. */

	public SentenceSplitter extractor;

	/**	The part of speech guesser. */

	public PartOfSpeechGuesser partOfSpeechGuesser;

	/**	The word lexicon. */

	public Lexicon wordLexicon;

	/**	Maps lemma to list of spellings in lexicon. */

	public KeyedSets<String, String> lemmaToSpellings;

	/**	The suffix lexicon. */

	public Lexicon suffixLexicon;

	/**	Simple spelling standardizer. */

	public ExtendedSimpleSpellingStandardizer simpleStandardizer;

	/**	Extended search spelling standardizer. */

	public ExtendedSearchSpellingStandardizer standardizer;

	/**	Part of speech tags. */

	public PartOfSpeechTags partOfSpeechTags;

	/**	The part of speech tagger. */

	public PartOfSpeechTagger tagger;

	/**	The part of speech retagger. */

	public PartOfSpeechRetagger retagger;

	/**	Transition matrix. */

	public TransitionMatrix transitionMatrix;

	/**	The name recognizer. */

	public NameRecognizer nameRecognizer;

	/**	Create adorner info object.
	 *
	 *	<p>
	 *	Loads the lexicons, transition matrix, and spelling data from
	 *	the named files (all read as utf-8), wires the guesser, tagger,
	 *	retagger, sentence splitter, standardizers, and name recognizer
	 *	together, and stores each of them in the corresponding public
	 *	field of this object.
	 *	</p>
	 *
	 *	@param	wordLexiconFileName			Word lexicon file name.
	 *	@param	suffixLexiconFileName		Suffix lexicon file name.
	 *	@param	transitionMatrixFileName	Part of speech transition matrix
	 *										file name.
	 *	@param	standardSpellingsFileName	Standard spellings file name.
	 *	@param	alternateSpellingsFileName	Alternate/standard spelling
	 *										pairs file name.
	 *	@param	extraWordLists				TaggedStrings array of extra
	 *										word lists.
	 *	@param	names						Names list.
	 *
	 *	@throws	Exception	if any of the underlying load or setup
	 *						calls fails.
	 */

	public AdornerInfo
	(
		String wordLexiconFileName ,
		String suffixLexiconFileName ,
		String transitionMatrixFileName ,
		String standardSpellingsFileName ,
		String alternateSpellingsFileName ,
		TaggedStrings[] extraWordLists ,
		Names names
	)
		throws Exception
	{
								//	Get word lexicon.

		wordLexicon	= new DefaultLexicon();

								//	Load word lexicon.

		wordLexicon.loadLexicon
		(
			new File( wordLexiconFileName ).toURI().toURL() ,
			"utf-8"
		);
								//	Get suffix lexicon.

		suffixLexicon	= new DefaultSuffixLexicon();

								//	Load suffix lexicon.

		suffixLexicon.loadLexicon
		(
			new File( suffixLexiconFileName ).toURI().toURL() ,
			"utf-8"
		);
								//	Get part of speech tags.

		partOfSpeechTags	= wordLexicon.getPartOfSpeechTags();

								//	Get part of speech guesser and
								//	give it both lexicons.

		partOfSpeechGuesser	= new DefaultPartOfSpeechGuesser();

		partOfSpeechGuesser.setWordLexicon( wordLexicon );
		partOfSpeechGuesser.setSuffixLexicon( suffixLexicon );

								//	Get sentence splitter.

		extractor	= new DefaultSentenceSplitter();

								//	Set guesser into sentence splitter.

		extractor.setPartOfSpeechGuesser( partOfSpeechGuesser );

								//	Create trigram part of speech tagger.

		tagger		= new TrigramTagger();

								//	Add proper noun retagger.

		retagger	= new ProperNounRetagger();

		tagger.setRetagger( retagger );

								//	Add auxiliary word lists to guesser.

		for ( TaggedStrings extraWordList : extraWordLists )
		{
			partOfSpeechGuesser.addAuxiliaryWordList( extraWordList );
		}
								//	Set tagger to use lexicon.

		tagger.setLexicon( wordLexicon );

								//	Set guesser into tagger.

		tagger.setPartOfSpeechGuesser( partOfSpeechGuesser );

								//	Load transition matrix.
								//
								//	Assign to the public field rather
								//	than a local of the same name, so
								//	the field is not left null after
								//	construction.

		transitionMatrix	= new TransitionMatrix();

		transitionMatrix.loadTransitionMatrix
		(
			new File( transitionMatrixFileName ).toURI().toURL() ,
			"utf-8",
			'\t'
		);

		tagger.setTransitionMatrix( transitionMatrix );

								//	Get extended search standardizer.

		standardizer	= new ExtendedSearchSpellingStandardizer();

								//	Load standard spellings.

		standardizer.loadStandardSpellings
		(
			new File( standardSpellingsFileName ).toURI().toURL() ,
			"utf-8"
		);
								//	Add name lists to standard spellings.

		standardizer.addStandardSpellings( names.getFirstNames() );

		standardizer.addStandardSpellings( names.getSurnames() );

		standardizer.addStandardSpellings( names.getPlaceNames().keySet() );

								//	Load alternate/standard spelling pairs.

		standardizer.loadAlternativeSpellings
		(
			new File( alternateSpellingsFileName ).toURI().toURL() ,
			"utf-8" ,
			"\t"
		);
								//	Get simple spelling standardizer.

		simpleStandardizer	= new ExtendedSimpleSpellingStandardizer();

								//	Set pairs list and standard
								//	spellings into simple
								//	standardizer as well.

		simpleStandardizer.setMappedSpellings(
			standardizer.getMappedSpellings() );

		simpleStandardizer.setStandardSpellings(
			standardizer.getStandardSpellings() );

								//	Create name recognizer.

		nameRecognizer	= new DefaultNameRecognizer();

		nameRecognizer.setPartOfSpeechTagger( tagger );

								//	Map lemmata to spellings in
								//	word lexicon.

		String[] spellings	= wordLexicon.getEntries();

		lemmaToSpellings	= new KeyedSets<String, String>();

		for ( String spelling : spellings )
		{
			String[] lemmata	= wordLexicon.getLemmata( spelling );

			for ( String lemma : lemmata )
			{
				lemmaToSpellings.add( lemma , spelling );
			}
		}
	}
}