package edu.northwestern.at.morphadorner.servlets;
/* Please see the license information at the end of this file. */
import java.io.*;
import java.net.*;
import java.util.*;
import javax.servlet.*;
import javax.servlet.http.*;
import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.html.*;
import edu.northwestern.at.utils.corpuslinguistics.languagerecognizer.*;
import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.*;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.*;
import edu.northwestern.at.utils.corpuslinguistics.namerecognizer.*;
import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.guesser.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.propernounretagger.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.transitionmatrix.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.trigram.*;
import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.*;
import edu.northwestern.at.utils.corpuslinguistics.spellingstandardizer.*;
import edu.northwestern.at.utils.corpuslinguistics.stemmer.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;
import edu.northwestern.at.utils.servlets.*;
/** Adorner information for MorphAdorner example servlets.
*
* <p>
* All the fields are public.
* </p>
*/
public class AdornerInfo
{
    /** The sentence splitter. */
    public SentenceSplitter extractor;

    /** The part of speech guesser (shared by the splitter and the tagger). */
    public PartOfSpeechGuesser partOfSpeechGuesser;

    /** The word lexicon. */
    public Lexicon wordLexicon;

    /** Maps lemma to list of spellings in lexicon. */
    public KeyedSets<String, String> lemmaToSpellings;

    /** The suffix lexicon. */
    public Lexicon suffixLexicon;

    /** Simple spelling standardizer. */
    public ExtendedSimpleSpellingStandardizer simpleStandardizer;

    /** Extended search spelling standardizer. */
    public ExtendedSearchSpellingStandardizer standardizer;

    /** Part of speech tags, as obtained from the word lexicon. */
    public PartOfSpeechTags partOfSpeechTags;

    /** The part of speech tagger. */
    public PartOfSpeechTagger tagger;

    /** The part of speech retagger. */
    public PartOfSpeechRetagger retagger;

    /** Transition matrix loaded by the constructor and set into the tagger. */
    public TransitionMatrix transitionMatrix;

    /** The name recognizer. */
    public NameRecognizer nameRecognizer;

    /** Create adorner info object.
     *
     * <p>
     * Loads the lexicons, transition matrix and spelling data from the
     * named files (all read as utf-8) and wires the resulting objects
     * together.  Every public field of this object is assigned by the
     * time the constructor returns normally.
     * </p>
     *
     * @param wordLexiconFileName          Word lexicon file name.
     * @param suffixLexiconFileName        Suffix lexicon file name.
     * @param transitionMatrixFileName     Part of speech transition matrix
     *                                     file name.
     * @param standardSpellingsFileName    Standard spellings file name.
     * @param alternateSpellingsFileName   Alternate/standard spelling
     *                                     pairs file name
     *                                     (tab-separated).
     * @param extraWordLists               TaggedStrings array of extra
     *                                     word lists.
     * @param names                        Names list.
     *
     * @throws Exception   If any of the load or setup calls made here
     *                     throws.
     */
    public AdornerInfo
    (
        String wordLexiconFileName ,
        String suffixLexiconFileName ,
        String transitionMatrixFileName ,
        String standardSpellingsFileName ,
        String alternateSpellingsFileName ,
        TaggedStrings[] extraWordLists ,
        Names names
    )
        throws Exception
    {
                                //  Get and load word lexicon.

        wordLexicon = new DefaultLexicon();

        wordLexicon.loadLexicon
        (
            new File( wordLexiconFileName ).toURI().toURL() ,
            "utf-8"
        );
                                //  Get and load suffix lexicon.

        suffixLexicon = new DefaultSuffixLexicon();

        suffixLexicon.loadLexicon
        (
            new File( suffixLexiconFileName ).toURI().toURL() ,
            "utf-8"
        );
                                //  Get part of speech tags.

        partOfSpeechTags = wordLexicon.getPartOfSpeechTags();

                                //  Get part of speech guesser and
                                //  give it both lexicons.

        partOfSpeechGuesser = new DefaultPartOfSpeechGuesser();

        partOfSpeechGuesser.setWordLexicon( wordLexicon );
        partOfSpeechGuesser.setSuffixLexicon( suffixLexicon );

                                //  Get sentence splitter and set
                                //  guesser into it.

        extractor = new DefaultSentenceSplitter();

        extractor.setPartOfSpeechGuesser( partOfSpeechGuesser );

                                //  Create trigram part of speech tagger.

        tagger = new TrigramTagger();

                                //  Add proper noun retagger.

        retagger = new ProperNounRetagger();

        tagger.setRetagger( retagger );

                                //  Add auxiliary word lists to guesser.

        for ( TaggedStrings extraWordList : extraWordLists )
        {
            partOfSpeechGuesser.addAuxiliaryWordList( extraWordList );
        }
                                //  Set tagger to use lexicon.

        tagger.setLexicon( wordLexicon );

                                //  Set guesser into tagger.

        tagger.setPartOfSpeechGuesser( partOfSpeechGuesser );

                                //  Load transition matrix.
                                //
                                //  Assign the public field here.  A
                                //  local declaration of the same name
                                //  would shadow the field and leave it
                                //  null for code that reads it from
                                //  this object.

        transitionMatrix = new TransitionMatrix();

        transitionMatrix.loadTransitionMatrix
        (
            new File( transitionMatrixFileName ).toURI().toURL() ,
            "utf-8",
            '\t'
        );

        tagger.setTransitionMatrix( transitionMatrix );

                                //  Get extended search standardizer.

        standardizer = new ExtendedSearchSpellingStandardizer();

                                //  Load standard spellings.

        standardizer.loadStandardSpellings
        (
            new File( standardSpellingsFileName ).toURI().toURL() ,
            "utf-8"
        );
                                //  Add name lists to standard spellings.

        standardizer.addStandardSpellings( names.getFirstNames() );
        standardizer.addStandardSpellings( names.getSurnames() );
        standardizer.addStandardSpellings( names.getPlaceNames().keySet() );

                                //  Load alternate/standard spelling pairs.

        standardizer.loadAlternativeSpellings
        (
            new File( alternateSpellingsFileName ).toURI().toURL() ,
            "utf-8" ,
            "\t"
        );
                                //  Get simple spelling standardizer.

        simpleStandardizer = new ExtendedSimpleSpellingStandardizer();

                                //  Set the mapped and standard spellings
                                //  from the extended search standardizer
                                //  into the simple standardizer as well.

        simpleStandardizer.setMappedSpellings(
            standardizer.getMappedSpellings() );

        simpleStandardizer.setStandardSpellings(
            standardizer.getStandardSpellings() );

                                //  Create name recognizer.

        nameRecognizer = new DefaultNameRecognizer();

        nameRecognizer.setPartOfSpeechTagger( tagger );

                                //  Map lemmata to spellings in
                                //  word lexicon:  each spelling is
                                //  recorded under every lemma the
                                //  lexicon reports for it.

        lemmaToSpellings = new KeyedSets<String, String>();

        for ( String spelling : wordLexicon.getEntries() )
        {
            for ( String lemma : wordLexicon.getLemmata( spelling ) )
            {
                lemmaToSpellings.add( lemma , spelling );
            }
        }
    }
}