package de.unihd.dbs.uima.annotator.treetagger;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
public class TreeTaggerReader implements Runnable {
private List<Token> tokens;
private BufferedReader reader;
private JCas jcas;
private Boolean annotate_sentences;
private int i; // position in list
// Possible End-of-Sentence Tags
private static final HashSet<String> hsEndOfSentenceTag = new HashSet<String>(
Arrays.asList(new String[] {
"SENT", // ENGLISH, FRENCH, GREEK, ...
"$.", // GERMAN, DUTCH
"FS", // SPANISH
"_Z_Fst", // ESTONIAN
"_Z_Int", // ESTONIAN
"_Z_Exc", // ESTONIAN
"ew", // CHINESE
})
);
public TreeTaggerReader(List<Token> tokens, BufferedReader reader, JCas jcas, Boolean annotate_sentences) {
this.tokens = tokens;
this.reader = reader;
this.jcas = jcas;
this.annotate_sentences = annotate_sentences;
}
@Override
public void run() {
i = 0;
try {
Boolean isStarted = false;
Sentence sentence = null;
String s = null;
// wait for the starting token to arrive
while(null != (s = reader.readLine()) && !isStarted) {
if(TreeTaggerProperties.STARTOFTEXT.equals(s)) {
isStarted = true;
break;
}
}
// iterate over all the output lines and tokens array (which have the same source and are hence symmetric)
while(null != (s = reader.readLine()) && isStarted) {
// check whether this is the finalizing token
if(TreeTaggerProperties.ENDOFTEXT.equals(s)) {
isStarted = false;
break;
}
// do splitting
String[] splits = s.split("\t", 2);
String tokenStr = splits[0].trim(), pos = "";
if(splits.length == 2) {
pos = splits[1].trim();
}
// grab a token
Token token = tokens.get(i++);
// Handle empty tokens (such as empty lines) in input file
while("".equals(token.getCoveredText())) {
// if part of the configuration, also add sentences to the jcas document
if("EMPTYLINE".equals(token.getPos()) && annotate_sentences) {
// Establish sentence structure
if(sentence == null) {
sentence = new Sentence(jcas);
sentence.setBegin(token.getBegin());
}
// Finish current sentence if end-of-sentence pos was found or document ended
sentence.setEnd(token.getEnd());
if(sentence.getBegin() < sentence.getEnd()){
sentence.addToIndexes();
}
// Make sure current sentence is not active anymore so that a new one might be created
sentence = null;
}
token.removeFromIndexes();
token = tokens.get(i++);
}
// remove tokens, otherwise they are in the index twice
token.removeFromIndexes();
// set part of speech tag and add to indexes again
if(!"".equals(token.getCoveredText())) {
token.setPos(pos);
token.addToIndexes();//System.out.println("added token " + token.getCoveredText() + " -> " + pos); // TODO
}
// if part of the configuration, also add sentences to the jcas document
if(annotate_sentences) {
// Establish sentence structure
if(sentence == null) {
sentence = new Sentence(jcas);
sentence.setBegin(token.getBegin());
}
// Finish current sentence if end-of-sentence pos was found or document ended
if(hsEndOfSentenceTag.contains(pos) || i == tokens.size()) {
sentence.setEnd(token.getEnd());
sentence.addToIndexes();
// Make sure current sentence is not active anymore so that a new one might be created
sentence = null;
}
}
}
while(i < tokens.size()) {
Token currentToken = tokens.get(i++);
if(sentence != null) {
sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
sentence.addToIndexes();
}
if("EMPTYLINE".equals(currentToken.getPos())) {
currentToken.removeFromIndexes();
}
}
} catch(IOException e) {
e.printStackTrace();
}
}
}