/**
* This is a preprocessing engine for use in a UIMA pipeline. It will invoke
* the tree-tagger binary that is supposed to be available on the system
* through Java process access.
*/
package de.unihd.dbs.uima.annotator.treetagger;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.impl.RootUimaContext_impl;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ConfigurationManager;
import org.apache.uima.resource.impl.ConfigurationManager_impl;
import org.apache.uima.resource.impl.ResourceManager_impl;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
/**
* @author Andreas Fay, Julian Zell
*
*/
public class TreeTaggerWrapper extends JCasAnnotator_ImplBase {
private Class<?> component = this.getClass();
// definitions of what names these parameters have in the wrapper's descriptor file
public static final String PARAM_LANGUAGE = "language";
public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";
public static final String PARAM_IMPROVE_GERMAN_SENTENCES = "improvegermansentences";
// language for this instance of the treetaggerwrapper
private Language language;
// switches for annotation parameters
private Boolean annotate_tokens = false;
private Boolean annotate_sentences = false;
private Boolean annotate_partofspeech = false;
private Boolean improve_german_sentences = false;
// local treetagger properties container, see below
private TreeTaggerProperties ttprops = new TreeTaggerProperties();
/**
* An embedded class that contains all of the treetagger-related settings.
* @author Julian Zell
*
*/
private class TreeTaggerProperties {
// treetagger language name for par files
public String languageName = null;
// absolute path of the treetagger
public String rootPath = null;
// Files for tokenizer and part of speech tagger (standard values)
public String tokScriptName = null;
public String parFileName = null;
public String abbFileName = null;
// english, italian, and french tagger models require additional splits (see tagger readme)
public String languageSwitch = null;
// perl requires(?) special hint for utf-8-encoded input/output (see http://perldoc.perl.org/perlrun.html#Command-Switches -C)
// The input text is read in HeidelTimeStandalone.java and always translated into UTF-8,
// i.e., switch always "-CSD"
public String utf8Switch = "-CSD";
// save System-specific separators for string generation
public String newLineSeparator = System.getProperty("line.separator");
public String fileSeparator = System.getProperty("file.separator");
}
/**
* uimacontext to make secondary initialize() method possible.
* -> programmatic, non-uima pipeline usage.
* @author julian
*
*/
private class TreeTaggerContext extends RootUimaContext_impl {
public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences,
Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
super();
// Initialize config
ConfigurationManager configManager = new ConfigurationManager_impl();
// Initialize context
this.initializeRoot(null, new ResourceManager_impl(), configManager);
// Set session
configManager.setSession(this.getSession());
// Set necessary variables
configManager.setConfigParameterValue(makeQualifiedName(PARAM_LANGUAGE), language.getName());
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_TOKENS), annotateTokens);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_PARTOFSPEECH), annotatePartOfSpeech);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_SENTENCES), annotateSentences);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_IMPROVE_GERMAN_SENTENCES), improveGermanSentences);
}
}
/**
* secondary initialize() to use wrapper outside of a uima pipeline
* @param language
* @param treeTaggerHome
* @param annotateTokens
* @param annotateSentences
* @param annotatePartOfSpeech
* @param improveGermanSentences
*/
public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens,
Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
this.setHome(treeTaggerHome);
TreeTaggerContext ttContext = new TreeTaggerContext(language, annotateTokens,
annotateSentences, annotatePartOfSpeech, improveGermanSentences);
this.initialize(ttContext);
}
/**
* initialization method where we fill configuration values and check some prerequisites
*/
public void initialize(UimaContext aContext) {
// check if the supplied language is one that we can currently handle
this.language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));
// get configuration from the descriptor
annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS);
annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES);
annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH);
improve_german_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_IMPROVE_GERMAN_SENTENCES);
// set some configuration based upon these values
ttprops.languageName = language.getTreeTaggerLangName();
if(ttprops.rootPath == null)
ttprops.rootPath = System.getenv("TREETAGGER_HOME");
ttprops.tokScriptName = "utf8-tokenize.perl";
// parameter file
if(!(new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.languageName + "-utf8.par").exists())) // get UTF8 version if it exists
ttprops.parFileName = ttprops.languageName + ".par";
else
ttprops.parFileName = ttprops.languageName + "-utf8.par";
// abbreviation file
if(!(new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.languageName + "-abbreviations-utf8").exists())) // get UTF8 version if it exists
ttprops.abbFileName = ttprops.languageName + "-abbreviations";
else
ttprops.abbFileName = ttprops.languageName + "-abbreviations-utf8";
ttprops.languageSwitch = language.getTreeTaggerSwitch();
// handle the treetagger path from the environment variables
if(ttprops.rootPath == null) {
Logger.printError("TreeTagger environment variable is not present, aborting.");
System.exit(-1);
}
// Check for whether the required treetagger parameter files are present
Boolean abbFileFlag = true;
Boolean parFileFlag = true;
Boolean tokScriptFlag = true;
File abbFile = new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.abbFileName);
File parFile = new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.parFileName);
File tokFile = new File(ttprops.rootPath+ttprops.fileSeparator+"cmd", ttprops.tokScriptName);
if (!(abbFileFlag = abbFile.exists())) {
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName);
}
if (!(parFileFlag = parFile.exists())) {
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName);
}
if (!(tokScriptFlag = tokFile.exists())) {
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName);
}
if (!abbFileFlag || !parFileFlag || !tokScriptFlag) {
Logger.printError(component, "Cannot find tree tagger (" + ttprops.rootPath + ttprops.fileSeparator
+ "cmd" + ttprops.fileSeparator + ttprops.tokScriptName + ")." +
" Make sure that path to tree tagger is set correctly in config.props!");
Logger.printError(component, "If path is set correctly:");
Logger.printError(component, "Maybe you need to download the TreeTagger tagger-scripts.tar.gz");
Logger.printError(component, "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz");
Logger.printError(component, "Extract this file and copy the missing file into the corresponding TreeTagger directories.");
Logger.printError(component, "If missing, copy " + ttprops.abbFileName + " into " + ttprops.rootPath+ttprops.fileSeparator+"lib");
Logger.printError(component, "If missing, copy " + ttprops.parFileName + " into " + ttprops.rootPath+ttprops.fileSeparator+"lib");
Logger.printError(component, "If missing, copy " + ttprops.tokScriptName + " into " + ttprops.rootPath+ttprops.fileSeparator+"cmd");
System.exit(-1);
}
}
/**
* Method that gets called to process the documents' cas objects
*/
public void process(JCas jcas) throws AnalysisEngineProcessException {
// if the annotate_tokens flag is set, annotate the tokens and add them to the jcas
if(annotate_tokens)
tokenize(jcas);
/* if the annotate_partofspeech flag is set, annotate partofspeech and,
* if specified, also tag sentences based upon the partofspeech tags.
*/
if(annotate_partofspeech)
doTreeTag(jcas);
// if the improve_german_sentences flag is set, improve the sentence tokens made by the treetagger
if(improve_german_sentences)
improveGermanSentences(jcas);
}
/**
* tokenizes a given JCas object's document text using the treetagger program
* and adds the recognized tokens to the JCas object.
* @param jcas JCas object supplied by the pipeline
*/
private void tokenize(JCas jcas) {
BufferedWriter tmpFileWriter = null;
File tmpDocument = null;
BufferedReader in = null;
try {
// Create temp file containing the document text
tmpDocument = File.createTempFile("pos", null);
tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));
tmpFileWriter.write(jcas.getDocumentText());
tmpFileWriter.close();
// assemble a command line for the tokenization script and execute it
ArrayList<String> command = new ArrayList<String>();
command.add("perl");
if(ttprops.utf8Switch != "")
command.add(ttprops.utf8Switch);
command.add(ttprops.rootPath + ttprops.fileSeparator + "cmd" + ttprops.fileSeparator + ttprops.tokScriptName);
if(ttprops.languageSwitch != "")
command.add(ttprops.languageSwitch);
command.add("-a");
command.add(ttprops.rootPath + ttprops.fileSeparator + "lib" + ttprops.fileSeparator + ttprops.abbFileName);
command.add(tmpDocument.getAbsolutePath());
String[] commandStr = new String[command.size()];
command.toArray(commandStr);
Process p = Runtime.getRuntime().exec(commandStr);
Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.tokScriptName + " and " + ttprops.abbFileName);
// read tokenized text to add tokens to the jcas
in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
String s;
int tokenOffset = 0;
// loop through all the lines in the treetagger output
while ((s = in.readLine()) != null) {
// charset missmatch fallback: signal (invalid) s
if (jcas.getDocumentText().indexOf(s, tokenOffset) < 0)
throw new RuntimeException("Opps! Could not find token "+s+
" in JCas after tokenizing with TreeTagger." +
" Hmm, there may exist a charset missmatch!" +
" Default encoding is " + Charset.defaultCharset().name() +
" and should always be UTF-8 (use -Dfile.encoding=UTF-8)." +
" If input document is not UTF-8 use -e option to set it according to the input, additionally.");
// create tokens and add them to the jcas's indexes.
Token newToken = new Token(jcas);
newToken.setBegin(jcas.getDocumentText().indexOf(s, tokenOffset));
newToken.setEnd(newToken.getBegin() + s.length());
newToken.addToIndexes();
tokenOffset = newToken.getEnd();
}
// clean up
in.close();
p.destroy();
tmpDocument.delete();
} catch (Exception e) {
e.printStackTrace();
} finally {
// I/O Housekeeping
if (tmpFileWriter != null) {
try {
tmpFileWriter.close();
} catch (IOException e) {
e.printStackTrace();
}
// Delete temp files
tmpDocument.delete();
}
if (in != null) {
try {
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
/**
* based on tokens from the jcas object, adds part of speech (POS) and sentence
* tags to the jcas object using the treetagger program.
* @param jcas JCas object supplied by the pipeline
*/
private void doTreeTag(JCas jcas) {
File tmpDocument = null;
BufferedWriter tmpFileWriter;
ArrayList<Token> tokens = new ArrayList<Token>();
try {
// create a temporary file and write our pre-existing tokens to it.
tmpDocument = File.createTempFile("postokens", null);
tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));
// iterate over existing tokens
FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
while(ai.hasNext()) {
Token t = (Token) ai.next();
tokens.add(t);
tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
}
tmpFileWriter.close();
} catch(IOException e) {
Logger.printError("Something went wrong creating a temporary file for the treetagger to process.");
System.exit(-1);
}
// Possible End-of-Sentence Tags
HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
hsEndOfSentenceTag.add("FS"); // SPANISH
hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
try {
// assemble a command line based on configuration and execute the POS tagging.
ArrayList<String> command = new ArrayList<String>();
command.add(ttprops.rootPath + ttprops.fileSeparator + "bin" + ttprops.fileSeparator + "tree-tagger");
command.add(ttprops.rootPath + ttprops.fileSeparator + "lib" + ttprops.fileSeparator + ttprops.parFileName);
command.add(tmpDocument.getAbsolutePath());
command.add("-no-unknown");
String[] commandStr = new String[command.size()];
command.toArray(commandStr);
Process p = Runtime.getRuntime().exec(commandStr);
Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);
BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
Sentence sentence = null;
// iterate over all the output lines and tokens array (which have the same source and are hence symmetric)
int i = 0;
String s = null;
while ((s = in.readLine()) != null) {
// grab a token
Token token = tokens.get(i++);
// modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file
while (token.getCoveredText().equals("")){
token.setPos("");
token.addToIndexes();
token = tokens.get(i++);
}
// remove tokens, otherwise they are in the index twice
token.removeFromIndexes();
// set part of speech tag and add to indexes again
token.setPos(s);
token.addToIndexes();
// if part of the configuration, also add sentences to the jcas document
if(annotate_sentences) {
// Establish sentence structure
if (sentence == null) {
sentence = new Sentence(jcas);
sentence.setBegin(token.getBegin());
}
// Finish current sentence if end-of-sentence pos was found or document ended
if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
sentence.setEnd(token.getEnd());
sentence.addToIndexes();
// Make sure current sentence is not active anymore so that a new one might be created
sentence = null;
}
}
}
in.close();
p.destroy();
} catch (Exception e) {
e.printStackTrace();
} finally {
// Delete temporary files
tmpDocument.delete();
}
}
public void setHome(String home) {
this.ttprops.rootPath = home;
}
/**
* improve german sentences; the treetagger splits german sentences incorrectly on some occasions
* @param jcas JCas object supplied by the pipeline
*/
private void improveGermanSentences(JCas jcas) {
HashSet<String> hsSentenceBeginnings = new HashSet<String>();
hsSentenceBeginnings.add("Januar");
hsSentenceBeginnings.add("Februar");
hsSentenceBeginnings.add("März");
hsSentenceBeginnings.add("April");
hsSentenceBeginnings.add("Mai");
hsSentenceBeginnings.add("Juni");
hsSentenceBeginnings.add("Juli");
hsSentenceBeginnings.add("August");
hsSentenceBeginnings.add("September");
hsSentenceBeginnings.add("Oktober");
hsSentenceBeginnings.add("November");
hsSentenceBeginnings.add("Dezember");
hsSentenceBeginnings.add("Jahrhundert");
hsSentenceBeginnings.add("Jahr");
hsSentenceBeginnings.add("Monat");
hsSentenceBeginnings.add("Woche");
HashSet<de.unihd.dbs.uima.types.heideltime.Sentence> hsRemoveAnnotations = new HashSet<de.unihd.dbs.uima.types.heideltime.Sentence>();
HashSet<de.unihd.dbs.uima.types.heideltime.Sentence> hsAddAnnotations = new HashSet<de.unihd.dbs.uima.types.heideltime.Sentence>();
Boolean changes = true;
while (changes) {
changes = false;
FSIndex annoHeidelSentences = jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Sentence.type);
FSIterator iterHeidelSent = annoHeidelSentences.iterator();
while (iterHeidelSent.hasNext()){
de.unihd.dbs.uima.types.heideltime.Sentence s1 = (de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next();
int substringOffset = java.lang.Math.max(s1.getCoveredText().length()-4,1);
if (s1.getCoveredText().substring(substringOffset).matches(".*[\\d]+\\.[\\s\\n]*$")){
if (iterHeidelSent.hasNext()){
de.unihd.dbs.uima.types.heideltime.Sentence s2 = (de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next();
iterHeidelSent.moveToPrevious();
for (String beg : hsSentenceBeginnings){
if (s2.getCoveredText().startsWith(beg)){
de.unihd.dbs.uima.types.heideltime.Sentence s3 = new de.unihd.dbs.uima.types.heideltime.Sentence(jcas);
s3.setBegin(s1.getBegin());
s3.setEnd(s2.getEnd());
hsAddAnnotations.add(s3);
hsRemoveAnnotations.add(s1);
hsRemoveAnnotations.add(s2);
changes = true;
break;
}
}
}
}
}
for (de.unihd.dbs.uima.types.heideltime.Sentence s : hsRemoveAnnotations){
s.removeFromIndexes(jcas);
}
hsRemoveAnnotations.clear();
for (de.unihd.dbs.uima.types.heideltime.Sentence s : hsAddAnnotations){
s.addToIndexes(jcas);
}
hsAddAnnotations.clear();
}
}
}