/*
* POSTagger.java
*
*/
package org.opensextant.toolbox;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.danieldk.nlp.jitar.data.Model;
import eu.danieldk.nlp.jitar.languagemodel.LanguageModel;
import eu.danieldk.nlp.jitar.languagemodel.LinearInterpolationLM;
import eu.danieldk.nlp.jitar.tagger.HMMTagger;
import eu.danieldk.nlp.jitar.tagger.HMMTagger.Sequence;
import eu.danieldk.nlp.jitar.wordhandler.KnownWordHandler;
import eu.danieldk.nlp.jitar.wordhandler.SuffixWordHandler;
import eu.danieldk.nlp.jitar.wordhandler.WordHandler;
import gate.Annotation;
import gate.AnnotationSet;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
/**
 * GATE processing resource that assigns part-of-speech tags to tokens using a
 * JITAR HMM tagger.
 *
 * <p>
 * For every "Sentence" annotation in the input annotation set, the "Token"
 * annotations within the sentence span are collected in document order, their
 * text is passed to the tagger, and the resulting tag is stored on each token
 * as the feature "pos". The log probability reported by the tagger for the
 * chosen tag sequence is stored on the sentence as the feature "posProb".
 *
 * <p>
 * Two models are loaded at initialization time: the main model (lexicon and
 * ngrams) and a "guesser" model whose lexicon feeds the suffix-based handler
 * used as a fallback for words the main lexicon does not know.
 */
@CreoleResource(name = "OpenSextant_POS_Tagger", comment = "A POS tagger based on the JITAR ngram model")
public class POSTagger extends AbstractLanguageAnalyser implements ProcessingResource {
    private static final long serialVersionUID = 1L;

    /** Log object. */
    private static final Logger LOGGER = LoggerFactory.getLogger(POSTagger.class);

    /** Character encoding used to read all model files. */
    private static final String MODEL_ENCODING = "UTF-8";

    /** Annotation type of the sentences to be tagged. */
    private static final String SENTENCE_TYPE = "Sentence";
    /** Annotation type of the tokens to be tagged. */
    private static final String TOKEN_TYPE = "Token";
    /** Token feature holding the token text. */
    private static final String TOKEN_STRING_FEATURE = "string";
    /** Token feature that receives the POS tag. */
    private static final String POS_FEATURE = "pos";
    /** Sentence feature that receives the log probability of the tag sequence. */
    private static final String POS_PROB_FEATURE = "posProb";

    /** Marker prepended (twice) to every sentence before tagging. */
    private static final String START_MARKER = "<START>";
    /** Marker appended (once) to every sentence before tagging. */
    private static final String END_MARKER = "<END>";
    /** Number of start markers prepended; also the offset of the first real tag. */
    private static final int START_MARKER_COUNT = 2;

    /** How long a suffix to use to guess unknown words. */
    private static final int MAX_SUFFIX_LENGTH = 3;
    /** Max freq for numerical words. */
    private static final int MAX_TRAIN_FREQ_NUM = 5;
    /** Max freq for uppercase words. */
    private static final int MAX_TRAIN_FREQ_UPPERCASE = 100000000;
    /** Max freq for lower case words. */
    private static final int MAX_TRAIN_FREQ_LOWERCASE = 100000000;
    /** Beam value handed to the HMM tagger. */
    private static final double TAGGER_BEAM = 1000.0;

    private String inputASName;
    /** Declared as a parameter but not currently read by {@link #execute()}. */
    private String outputASName;

    /** The lexicon and ngrams used by the tagger. */
    private URL lexiconFileURL;
    private URL ngramFileURL;

    /** The lexicon and ngrams used by the guesser algorithm (suffix handler). */
    private URL guesserLexiconFileURL;
    private URL guesserNgramFileURL;

    private transient HMMTagger tagger;
    private transient Model model;
    private transient Model guesserModel;

    /**
     * Loads both models and builds the tagger.
     *
     * <p>
     * The instance fields are only replaced once everything has been built
     * successfully, so a failed (re-)initialization does not leave a
     * half-configured tagger behind.
     *
     * @throws ResourceInstantiationException
     *             if a model URL has not been set or a model cannot be read
     */
    private void initialize() throws ResourceInstantiationException {
        requireParameter(lexiconFileURL, "lexiconFileURL");
        requireParameter(ngramFileURL, "ngramFileURL");
        requireParameter(guesserLexiconFileURL, "guesserLexiconFileURL");
        requireParameter(guesserNgramFileURL, "guesserNgramFileURL");

        // Load the models
        Model loadedModel;
        Model loadedGuesserModel;
        try {
            loadedModel = loadModel(lexiconFileURL, ngramFileURL);
            loadedGuesserModel = loadModel(guesserLexiconFileURL, guesserNgramFileURL);
        } catch (IOException e) {
            LOGGER.error("Unable to read the POS model!", e);
            // Fail the instantiation instead of continuing with null models.
            throw new ResourceInstantiationException("Unable to read the POS model!", e);
        }

        // Set up word handlers. The suffix word handler is used as a fallback
        // to the known word handler.
        // NOTE(review): the meaning of the trailing argument (10) is not
        // evident from this file; confirm against the JITAR SuffixWordHandler
        // documentation.
        SuffixWordHandler suffixHandler = new SuffixWordHandler(loadedGuesserModel.lexicon(),
                loadedModel.uniGrams(), MAX_SUFFIX_LENGTH, MAX_TRAIN_FREQ_NUM, MAX_TRAIN_FREQ_UPPERCASE,
                MAX_TRAIN_FREQ_LOWERCASE, 10);
        WordHandler wordHandler = new KnownWordHandler(loadedModel.lexicon(), loadedModel.uniGrams(),
                suffixHandler);

        // Create an n-gram language model.
        LanguageModel languageModel = new LinearInterpolationLM(loadedModel.uniGrams(), loadedModel.biGrams(),
                loadedModel.triGrams());

        // Initialize a tagger with a beam of 1000.0.
        HMMTagger newTagger = new HMMTagger(loadedModel, wordHandler, languageModel, TAGGER_BEAM);

        // Everything succeeded: publish the new state.
        model = loadedModel;
        guesserModel = loadedGuesserModel;
        tagger = newTagger;
    }

    /**
     * Verifies that a required URL parameter has been set.
     *
     * @param value
     *            the parameter value
     * @param parameterName
     *            the parameter name, used in the error message
     * @throws ResourceInstantiationException
     *             if {@code value} is null
     */
    private static void requireParameter(URL value, String parameterName) throws ResourceInstantiationException {
        if (value == null) {
            throw new ResourceInstantiationException("Required parameter " + parameterName + " is not set.");
        }
    }

    /**
     * Reads one model from a lexicon and an ngram file, closing both streams
     * afterwards regardless of the outcome.
     *
     * @param lexiconURL
     *            location of the lexicon file
     * @param ngramURL
     *            location of the ngram file
     * @return the model read from the two files
     * @throws IOException
     *             if either file cannot be opened or read
     */
    private static Model loadModel(URL lexiconURL, URL ngramURL) throws IOException {
        BufferedReader lexiconReader = null;
        BufferedReader ngramReader = null;
        try {
            lexiconReader = new BufferedReader(new InputStreamReader(lexiconURL.openStream(), MODEL_ENCODING));
            ngramReader = new BufferedReader(new InputStreamReader(ngramURL.openStream(), MODEL_ENCODING));
            return Model.readModel(lexiconReader, ngramReader);
        } finally {
            closeQuietly(lexiconReader);
            closeQuietly(ngramReader);
        }
    }

    /**
     * Closes a reader, logging (rather than propagating) any failure so that
     * it cannot mask an exception raised while reading.
     *
     * @param reader
     *            the reader to close; may be null
     */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            LOGGER.warn("Unable to close a POS model stream", e);
        }
    }

    /**
     * Loads the models and builds the tagger.
     *
     * @return this resource
     * @throws ResourceInstantiationException
     *             if the models cannot be loaded
     */
    @Override
    public Resource init() throws ResourceInstantiationException {
        initialize();
        return this;
    }

    /**
     * Reloads the models and rebuilds the tagger.
     *
     * @throws ResourceInstantiationException
     *             if the models cannot be loaded
     */
    @Override
    public void reInit() throws ResourceInstantiationException {
        initialize();
    }

    /**
     * Tags every token of every sentence in the current document.
     *
     * @throws ExecutionException
     *             if no document is set or the tagger has not been initialized
     */
    @Override
    public void execute() throws ExecutionException {
        if (document == null) {
            throw new ExecutionException("No document to process!");
        }
        if (tagger == null || model == null) {
            throw new ExecutionException("The POS tagger has not been initialized.");
        }

        // An empty name means the default annotation set. A local is used so
        // that the runtime parameter itself is not modified.
        boolean useDefaultSet = inputASName == null || "".equals(inputASName);
        AnnotationSet inputAS = useDefaultSet ? document.getAnnotations() : document.getAnnotations(inputASName);

        // Get all of the sentences in document
        AnnotationSet sentenceSet = inputAS.get(SENTENCE_TYPE);

        for (Annotation sentence : sentenceSet) {
            Long start = sentence.getStartNode().getOffset();
            Long end = sentence.getEndNode().getOffset();

            // Get the Tokens within the current sentence
            AnnotationSet tokensInSentence = inputAS.get(TOKEN_TYPE, start, end);
            List<Annotation> tokens = gate.Utils.inDocumentOrder(tokensInSentence);

            // Add start/end markers, 2 starts and 1 end.
            List<String> tokenStrings = new ArrayList<String>(tokens.size() + START_MARKER_COUNT + 1);
            for (int i = 0; i < START_MARKER_COUNT; i++) {
                tokenStrings.add(START_MARKER);
            }
            for (Annotation token : tokens) {
                tokenStrings.add(tokenText(token));
            }
            tokenStrings.add(END_MARKER);

            // Send the tokens to the tagger
            Sequence seq = HMMTagger.highestProbabilitySequence(tagger.viterbi(tokenStrings), model);

            // Set the probability on the Sentence as feature "posProb"
            sentence.getFeatures().put(POS_PROB_FEATURE, seq.logProb());

            // Attach the returned tag to each token as feature "pos",
            // skipping the 2 <START> and 1 <END> tags
            List<String> tags = seq.sequence();
            for (int i = 0; i < tokens.size(); i++) {
                tokens.get(i).getFeatures().put(POS_FEATURE, tags.get(i + START_MARKER_COUNT));
            }
        } // end sentence iterator
    }

    /**
     * Returns the text of a token: the value of its "string" feature when
     * present, otherwise the document text covered by the annotation, so the
     * tagger is never handed a null entry.
     *
     * @param token
     *            the token annotation
     * @return the text to pass to the tagger
     */
    private String tokenText(Annotation token) {
        Object value = token.getFeatures().get(TOKEN_STRING_FEATURE);
        if (value != null) {
            return value.toString();
        }
        return gate.Utils.stringFor(document, token);
    }

    /** @return the location of the lexicon file of the main model */
    public URL getLexiconFileURL() {
        return lexiconFileURL;
    }

    /**
     * @param lexiconFileURL
     *            the location of the lexicon file of the main model
     */
    @CreoleParameter
    public void setLexiconFileURL(URL lexiconFileURL) {
        this.lexiconFileURL = lexiconFileURL;
    }

    /** @return the location of the ngram file of the main model */
    public URL getNgramFileURL() {
        return ngramFileURL;
    }

    /**
     * @param ngramFileURL
     *            the location of the ngram file of the main model
     */
    @CreoleParameter
    public void setNgramFileURL(URL ngramFileURL) {
        this.ngramFileURL = ngramFileURL;
    }

    /** @return the location of the lexicon file of the guesser model */
    public URL getGuesserLexiconFileURL() {
        return guesserLexiconFileURL;
    }

    /**
     * @param guesserLexiconFileURL
     *            the location of the lexicon file of the guesser model
     */
    @CreoleParameter
    public void setGuesserLexiconFileURL(URL guesserLexiconFileURL) {
        this.guesserLexiconFileURL = guesserLexiconFileURL;
    }

    /** @return the location of the ngram file of the guesser model */
    public URL getGuesserNgramFileURL() {
        return guesserNgramFileURL;
    }

    /**
     * @param guesserNgramFileURL
     *            the location of the ngram file of the guesser model
     */
    @CreoleParameter
    public void setGuesserNgramFileURL(URL guesserNgramFileURL) {
        this.guesserNgramFileURL = guesserNgramFileURL;
    }

    /** @return the name of the annotation set read by {@link #execute()} */
    public String getInputASName() {
        return inputASName;
    }

    /**
     * @param inputASName
     *            the name of the annotation set holding the sentences and
     *            tokens; null or empty selects the default annotation set
     */
    @Optional
    @RunTime
    @CreoleParameter
    public void setInputASName(String inputASName) {
        this.inputASName = inputASName;
    }

    /** @return the configured output annotation set name */
    public String getOutputASName() {
        return outputASName;
    }

    /**
     * @param outputASName
     *            the output annotation set name; stored but not currently
     *            read by {@link #execute()}, which writes its features onto
     *            the annotations of the input set
     */
    @Optional
    @RunTime
    @CreoleParameter
    public void setOutputASName(String outputASName) {
        this.outputASName = outputASName;
    }
} // class POSTagger