package hr.fer.zemris.takelab.uima.annotator.hunpos;
import hr.fer.zemris.takelab.splitter.TokenSplitter;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.impl.RootUimaContext_impl;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ConfigurationManager;
import org.apache.uima.resource.impl.ConfigurationManager_impl;
import org.apache.uima.resource.impl.ResourceManager_impl;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
/**
* A wrapper around the CSTLemma lemmatiser and the HunPos POS tagger, to be used as a pre-processing engine in the UIMA pipeline.
* Currently only offers support for the Croatian language.
* @version 0.9
* @author Luka Skukan
*
*/
public class HunPosTaggerWrapper extends JCasAnnotator_ImplBase{
public static final String PARAM_LANGUAGE = "language";
public static final String PARAM_PATH = "hunpos_path";
public static final String PARAM_MODEL_PATH = "model_path";
public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
public static final String PARAM_ANNOTATE_POS = "annotate_pos";
/**
* The language used by the instance of the wrapper
*/
private Language language;
/**
* Indicates whether token annotation occurs
*/
private boolean annotate_tokens;
/**
* Indicates whether sentence annotation occurs, with POS annotation being a prerequisite for it
*/
private boolean annotate_sentences;
/**
* Indicates whether Part-Of-Speech annotation occurs
*/
private boolean annotate_pos;
/**
* Initializes the wrapper with the given language and settings what to annotate. Sentences will not be annotated, even if set to True, unless POS annotation occurs.
* @param language Language used by the wrapper, determines which rule files are read
* @param annotateTokens Are tokens to be annotated?
* @param annotateSentences Are sentences to be annotated?
* @param annotatePOS Is POS to be annotated?
*/
public void initialize(Language language, String hunpos_path, String hunpos_model_path, Boolean annotateTokens, Boolean annotateSentences, Boolean annotatePOS) {
this.initialize(new HunPosTaggerContext(language, hunpos_path, hunpos_model_path, annotateTokens, annotateSentences, annotatePOS));
}
/**
* Initializes the wrapper from UIMA context. See other initialize method for parameters required within context.
*/
public void initialize(UimaContext aContext) {
annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS);
annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES);
annotate_pos = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_POS);
this.language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));
String hunposPath = (String) aContext.getConfigParameterValue(PARAM_PATH);
String modelPath = (String) aContext.getConfigParameterValue(PARAM_MODEL_PATH);
HunPosWrapper.initialize(modelPath, hunposPath);
}
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
if(annotate_tokens) {
TokenSplitterWrapper.splitTokens(aJCas);
}
if(annotate_pos) {
HunPosWrapper.tagPOS(aJCas, annotate_sentences);
}
if(language == Language.CROATIAN) {
fixCroatianSentences(aJCas);
}
}
/**
* A simple wrapper over the {@link TokenSplitter} class which invokes it over a {@link JCas} object and produces word tokens
* out of the text in the JCas covered document.
* @author Luka Skukan
*
*/
private static class TokenSplitterWrapper {
/**
* Takes a document wrapped in the {@link JCas} object and records all tokens within the covered text as
* {@link Token} objects which are added to the JCas indexes.
* @param jcas
*/
public static void splitTokens(JCas jcas) {
List<String> tokens = TokenSplitter.getTokens(jcas.getDocumentText());
int tokenOffset = 0;
for(String token : tokens) {
if (jcas.getDocumentText().indexOf(token, tokenOffset) < 0)
throw new RuntimeException("Opps! Could not find token "+ token +
" in JCas after tokenizing with token splitter for Croatian." +
" Hmm, there may exist a charset missmatch!" +
" Default encoding is " + Charset.defaultCharset().name() +
" and should always be UTF-8.");
// create tokens and add them to the jcas's indexes.
Token newToken = new Token(jcas);
newToken.setBegin(jcas.getDocumentText().indexOf(token, tokenOffset));
newToken.setEnd(newToken.getBegin() + token.length());
newToken.addToIndexes();
tokenOffset = newToken.getEnd();
}
}
}
private static class HunPosWrapper {
private static List<String> command;
public static final String HUNPOS_HOME = "HUNPOS_HOME";
@SuppressWarnings("unused")
public static void initialize(String modelPath) {
initialize(modelPath, null);
}
public static void initialize(String modelPath, String hunposPath) {
String hunposRoot = hunposPath;
if(hunposRoot == null) {
hunposRoot = System.getenv(HUNPOS_HOME);
}
if(hunposRoot == null || !new File(hunposRoot).exists()) {
Logger.printError(HunPosWrapper.class, "The environment variable HUNPOS_HOME was not set, or set to \"" + hunposRoot + "\", which does not exist.");
System.exit(-1);
}
File hunPosRootFile = new File(hunposRoot);
command = new ArrayList<String>();
command.add(hunposRoot + "/hunpos-tag"); //Constructing a tagger call
File modelFile = new File(hunPosRootFile, modelPath);
if(modelFile.exists()) {
command.add(modelFile.getAbsolutePath());
} else {
Logger.printError(HunPosWrapper.class, "The supplied model path " + modelPath + " does not exist.");
System.exit(-1);
}
}
public static void tagPOS(JCas jCas, boolean tagSentences) {
Process p = null;
String[] cmd = new String[command.size()];
command.toArray(cmd);
try {
p = Runtime.getRuntime().exec(cmd);
} catch (IOException e2) {
Logger.printError(HunPosWrapper.class, "An error occured while trying to call HunPos at " + System.getenv(HUNPOS_HOME));
e2.printStackTrace();
}
Writer writer = new OutputStreamWriter(p.getOutputStream());
Logger.printDetail(HunPosWrapper.class, "Starting the POS tagging process.");
final List<Token> tokens = new ArrayList<Token>();
FSIterator ai = jCas.getAnnotationIndex(Token.type).iterator();
while(ai.hasNext()) {
Token t = (Token) ai.next();
tokens.add(t);
}
class TaggingJob implements Runnable {
private final Pattern HUNPOS_PATTERN = Pattern.compile("^(.+)\t([^\t]+)$");
private JCas jCas;
private List<Token> tokens;
private boolean tagSentences;
private InputStream input;
private final String terminal = "Z";
private HunPosAnnotionTranslator trans = new HunPosAnnotionTranslator();
public TaggingJob(JCas jCas, List<Token> tokens, boolean tagSentences, InputStream input) {
this.jCas = jCas;
this.tokens = tokens;
this.tagSentences = tagSentences;
this.input = input;
}
@Override
public void run() {
InputStreamReader ir = new InputStreamReader(new BufferedInputStream(input), Charset.forName("UTF-8"));
Scanner scan = new Scanner(ir);
int i = 0;
String s = null;
Sentence sentence = null;
try {
while(true) {
if(!scan.hasNextLine()) {
break;
}
s = scan.nextLine().trim();
if(s.isEmpty()) continue;
Token token = tokens.get(i++);
while (token.getCoveredText().isEmpty()){
token.setPos("");
token.addToIndexes();
token = tokens.get(i++);
}
Matcher m = HUNPOS_PATTERN.matcher(s);
if(m.find()) {
s = m.group(2);
} else {
i--;
}
token.removeFromIndexes();
token.setPos(trans.translate(s));
token.addToIndexes();
if(tagSentences) {
if (sentence == null) {
sentence = new Sentence(jCas);
sentence.setBegin(token.getBegin());
}
if (terminal.equals(s) || i == tokens.size()) {
sentence.setEnd(token.getEnd());
sentence.addToIndexes();
sentence = null;
}
}
}
scan.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
Thread thr = new Thread(new TaggingJob(jCas, tokens, tagSentences, p.getInputStream()));
thr.start();
for(Token t : tokens) {
try {
writer.write(t.getCoveredText() + "\n");
} catch (IOException e) {
e.printStackTrace();
}
}
try {
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
try {
thr.join();
p.waitFor();
} catch (InterruptedException e1) {
e1.printStackTrace();
}
}
}
private void fixCroatianSentences(JCas jCas) {
final String reBeginsWithMonth =
"^(Siječ(anj|nja)|Veljač[ae]|Ožuj(ak|ka)|Trav(anj|nja)|Svib(anj|nja)|Lip(anj|nja)|Srp(anj|nja)|Kolovoza?|Ruj(an|na)|Listopada?|Studen(i|og)|Prosin(ac|ca)).*";
final String reBeginsWithUppercase = "^[A-ZŠĐČĆŽ].*";
final String reEndsWithDate = "(?s).*\\d{1,4}\\.$";
final String reFalseSentenceEnd = "(?s)^.*(\\s[A-Z]\\.|[:;,%\"\\(\\)\\-])$";
FSIndex annoHeidelSentences = jCas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Sentence.type);
FSIterator iterHeidelSent = annoHeidelSentences.iterator();
HashSet<Sentence> hsNewAnnotations = new HashSet<Sentence>();
HashSet<Sentence> hsOldAnnotations = new HashSet<Sentence>();
boolean prevIsDate = false;
boolean prevIsFalseEnd = false;
Sentence sOld = null;
while(iterHeidelSent.hasNext()) {
Sentence s = (Sentence) iterHeidelSent.next();
String text = s.getCoveredText();
//If the previous sentence ended in a day or month and the next one is not uppercase, or is an uppercase month, merge them
if(prevIsFalseEnd || (prevIsDate && (!text.matches(reBeginsWithUppercase) || (text.matches(reBeginsWithUppercase) && text.matches(reBeginsWithMonth))))) {
Sentence sMerged = new Sentence(jCas);
sMerged.setBegin(sOld.getBegin());
sMerged.setEnd(s.getEnd());
if(hsNewAnnotations.contains(sOld)) {
hsNewAnnotations.remove(sOld);
}
hsNewAnnotations.add(sMerged);
prevIsDate = false;
prevIsFalseEnd = false;
sOld = sMerged;
text = sOld.getCoveredText();
} else {
if(!hsNewAnnotations.contains(s)) {
hsNewAnnotations.add(s);
}
sOld = s;
}
if(text.matches(reEndsWithDate)) {
prevIsDate = true;
}
if(text.matches(reFalseSentenceEnd)) {
prevIsFalseEnd = true;
}
}
iterHeidelSent.moveToFirst();
while(iterHeidelSent.hasNext()) hsOldAnnotations.add((Sentence)iterHeidelSent.next());
for(Sentence s : hsOldAnnotations) s.removeFromIndexes(jCas);
for(Sentence s : hsNewAnnotations) s.addToIndexes(jCas);
}
private class HunPosTaggerContext extends RootUimaContext_impl {
public HunPosTaggerContext(Language language, String hunpos_path, String hunpos_model_path,
Boolean annotateTokens, Boolean annotateSentences, Boolean annotatePartOfSpeech) {
super();
// Initialize config
ConfigurationManager configManager = new ConfigurationManager_impl();
// Initialize context
this.initializeRoot(null, new ResourceManager_impl(), configManager);
// Set session
configManager.setSession(this.getSession());
// Set necessary variables
configManager.setConfigParameterValue(makeQualifiedName(PARAM_LANGUAGE), language.getName());
configManager.setConfigParameterValue(makeQualifiedName(PARAM_MODEL_PATH), hunpos_model_path);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_PATH), hunpos_path);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_TOKENS), annotateTokens);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_POS), annotatePartOfSpeech);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_SENTENCES), annotateSentences);
}
}
}