/* * POSTagger.java * */ package org.opensextant.toolbox; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.danieldk.nlp.jitar.data.Model; import eu.danieldk.nlp.jitar.languagemodel.LanguageModel; import eu.danieldk.nlp.jitar.languagemodel.LinearInterpolationLM; import eu.danieldk.nlp.jitar.tagger.HMMTagger; import eu.danieldk.nlp.jitar.tagger.HMMTagger.Sequence; import eu.danieldk.nlp.jitar.wordhandler.KnownWordHandler; import eu.danieldk.nlp.jitar.wordhandler.SuffixWordHandler; import eu.danieldk.nlp.jitar.wordhandler.WordHandler; import gate.Annotation; import gate.AnnotationSet; import gate.ProcessingResource; import gate.Resource; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; @CreoleResource(name = "OpenSextant_POS_Tagger", comment = "A POS tagger based on the JITAR ngram model") public class POSTagger extends AbstractLanguageAnalyser implements ProcessingResource { private static final long serialVersionUID = 1L; private String inputASName; private String outputASName; /** The lexicon and ngrams used by the tagger. */ private URL lexiconFileURL; private URL ngramFileURL; /** The lexicon and ngrams used by the gueser algoithm (suffix handler). */ private URL guesserLexiconFileURL; private URL guesserNgramFileURL; private transient HMMTagger tagger; private transient Model model; private transient Model guesserModel; /** Log object. */ private static final Logger LOGGER = LoggerFactory.getLogger(POSTagger.class); private void initialize() { // Load the model try { model = Model.readModel(new BufferedReader(new InputStreamReader(lexiconFileURL.openStream(), "UTF-8")), new BufferedReader(new InputStreamReader(ngramFileURL.openStream(), "UTF-8"))); guesserModel = Model.readModel( new BufferedReader(new InputStreamReader(guesserLexiconFileURL.openStream(), "UTF-8")), new BufferedReader(new InputStreamReader(guesserNgramFileURL.openStream(), "UTF-8"))); } catch (IOException e) { LOGGER.error("Unable to read the POS model!", e); } // Set up word handlers. The suffix word handler is used as a fallback // to the known word handler. int maxSuffixLength = 3; // low long a suffix to use to guess unknown // words int maxTrainFreqNum = 5; // max freq for numerical words int maxTrainFreqUppercase = 100000000; // max freq for uppercase words int maxTrainFreqLowercase = 100000000; // max freq for lower case words SuffixWordHandler swh = new SuffixWordHandler(guesserModel.lexicon(), model.uniGrams(), maxSuffixLength, maxTrainFreqNum, maxTrainFreqUppercase, maxTrainFreqLowercase, 10); WordHandler wh = new KnownWordHandler(model.lexicon(), model.uniGrams(), swh); // Create an n-gram language model. LanguageModel lm = new LinearInterpolationLM(model.uniGrams(), model.biGrams(), model.triGrams()); // Initialize a tagger with a beam of 1000.0. tagger = new HMMTagger(model, wh, lm, 1000.0); } @Override public Resource init() throws ResourceInstantiationException { initialize(); return this; } @Override public void reInit() throws ResourceInstantiationException { initialize(); } @Override public void execute() throws ExecutionException { if ("".equals(inputASName)) { inputASName = null; } AnnotationSet inputAS = (inputASName == null) ? document.getAnnotations() : document.getAnnotations(inputASName); // Get all of the sentences in document AnnotationSet sentenceSet = inputAS.get("Sentence"); // For every sentence: // Get the tokens which make up that sentence, // first as Annotation[], then as a List<String> Iterator<Annotation> sentIter = sentenceSet.iterator(); while (sentIter.hasNext()) { Annotation currSent = sentIter.next(); Long start = currSent.getStartNode().getOffset(); Long end = currSent.getEndNode().getOffset(); // Get the Tokens within the current sentence AnnotationSet tokensInSentence = inputAS.get("Token", start, end); List<Annotation> annoList = gate.Utils.inDocumentOrder(tokensInSentence); List<String> tokenList = new ArrayList<String>(); for (int i = 0; i < annoList.size(); i++) { // ASSUMPTION: every token has a feature "string" String tmpString = (String) annoList.get(i).getFeatures().get("string"); tokenList.add(tmpString); } // Add start/end markers, 2 starts and 1 end. tokenList.add(0, "<START>"); tokenList.add(0, "<START>"); tokenList.add("<END>"); int indexOffset = 2; String featureName = "pos"; // Send the tokens to the tagger Sequence seq = HMMTagger.highestProbabilitySequence(tagger.viterbi(tokenList), model); // Set the probability on the Sentence as feature "posProb" currSent.getFeatures().put("posProb", seq.logProb()); // get the tags from the sequence List<String> tags = seq.sequence(); // Attach the returned tag to each token as feature "pos" // skipping the 2 <START> and 1 <END> tags for (int i = 0; i < annoList.size(); i++) { annoList.get(i).getFeatures().put(featureName, tags.get(i + indexOffset)); } } // end sentence iterator } /** End execute(). */ public URL getLexiconFileURL() { return lexiconFileURL; } @CreoleParameter public void setLexiconFileURL(URL lexiconFileURL) { this.lexiconFileURL = lexiconFileURL; } public URL getNgramFileURL() { return ngramFileURL; } @CreoleParameter public void setNgramFileURL(URL ngramFileURL) { this.ngramFileURL = ngramFileURL; } public URL getGuesserLexiconFileURL() { return guesserLexiconFileURL; } @CreoleParameter public void setGuesserLexiconFileURL(URL guesserLexiconFileURL) { this.guesserLexiconFileURL = guesserLexiconFileURL; } public URL getGuesserNgramFileURL() { return guesserNgramFileURL; } @CreoleParameter public void setGuesserNgramFileURL(URL guesserNgramFileURL) { this.guesserNgramFileURL = guesserNgramFileURL; } public String getInputASName() { return inputASName; } @Optional @RunTime @CreoleParameter public void setInputASName(String inputASName) { this.inputASName = inputASName; } public String getOutputASName() { return outputASName; } @Optional @RunTime @CreoleParameter public void setOutputASName(String outputASName) { this.outputASName = outputASName; } } // class POSTagger