TreeTaggerWrapper.java example

Explorer

heideltime-master
- src
  - de
    - unihd
      - dbs
        heideltime
        standalone
        CLISwitch.java
        Config.java
        DocumentType.java
        HeidelTimeStandalone.java
        OutputType.java
        POSTagger.java
        components
        JCasFactory.java
        PartOfSpeechTagger.java
        ResultFormatter.java
        UIMAAnnotator.java
        impl
        AllLanguagesTokenizerWrapper.java
        HunPosTaggerWrapper.java
        IntervalTaggerWrapper.java
        JCasFactoryImpl.java
        JVnTextProWrapper.java
        StandaloneConfigContext.java
        StanfordPOSTaggerWrapper.java
        TimeMLResultFormatter.java
        TreeTaggerWrapper.java
        UimaContextImpl.java
        XMIResultFormatter.java
        exceptions
        DocumentCreationTimeMissingException.java
        uima
        annotator
        alllanguagestokenizer
        AllLanguagesTokenizer.java
        heideltime
        HeidelTime.java
        HeidelTimeException.java
        ProcessorManager.java
        processors
        DecadeProcessor.java
        GenericProcessor.java
        HolidayProcessor.java
        ProcessorInitializationException.java
        ProcessorProcessingException.java
        TemponymPostprocessing.java
        resources
        GenericResourceManager.java
        Language.java
        NormalizationManager.java
        RePatternManager.java
        RegexHashMap.java
        ResourceMap.java
        ResourceScanner.java
        RuleManager.java
        utilities
        ContextAnalyzer.java
        DateCalculator.java
        LocaleException.java
        Logger.java
        Toolbox.java
        intervaltagger
        IntervalTagger.java
        jvntextprowrapper
        JVnTextProWrapper.java
        stanfordtagger
        StanfordPOSTaggerWrapper.java
        treetagger
        TreeTaggerProcess.java
        TreeTaggerProperties.java
        TreeTaggerReader.java
        TreeTaggerTokenizer.java
        TreeTaggerWrapper.java
        TreeTaggerWriter.java
        consumer
        aceternwriter
        ACETernWriter.java
        eventi2014writer
        Eventi2014Writer.java
        tempeval2writer
        Tempeval2Writer.java
        tempeval3writer
        TempEval3Writer.java
        reader
        aceternreader
        ACETernReader.java
        eventi2014reader
        Eventi2014Reader.java
        tempeval2reader
        Tempeval2Reader.java
        tempeval3reader
        Tempeval3Reader.java
        types
        heideltime
        Dct.java
        Dct_Type.java
        Event.java
        Event_Type.java
        GoldEvent.java
        GoldEvent_Type.java
        IntervalCandidateSentence.java
        IntervalCandidateSentence_Type.java
        Sentence.java
        Sentence_Type.java
        SourceDocInfo.java
        SourceDocInfo_Type.java
        Timex3.java
        Timex3Interval.java
        Timex3Interval_Type.java
        Timex3_Type.java
        Token.java
        Token_Type.java
  - hr
    - fer
      - zemris
        takelab
        splitter
        TokenSplitter.java
        uima
        annotator
        hunpos
        HunPosAnnotationMapping.java
        HunPosAnnotionTranslator.java
        HunPosTaggerWrapper.java
  - jflexcrf
  - jmaxent
  - jvnpostag
  - jvnsegmenter
  - jvnsensegmenter
    - FeatureGenerator.java
    - JVnSenSegmenter.java
  - jvntextpro
  - jvntokenizer
    - JVnTokenizer.java
    - PennTokenizer.java

/**
 * This is a preprocessing engine for use in a UIMA pipeline. It will invoke
 * the tree-tagger binary that is supposed to be available on the system
 * through Java process access.
 */
package de.unihd.dbs.uima.annotator.treetagger;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.impl.RootUimaContext_impl;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ConfigurationManager;
import org.apache.uima.resource.impl.ConfigurationManager_impl;
import org.apache.uima.resource.impl.ResourceManager_impl;

import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
import de.unihd.dbs.uima.annotator.treetagger.TreeTaggerTokenizer.Flag;

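/**
 * UIMA annotator that wraps the TreeTagger: it tokenizes the document text and
 * lets the TreeTagger add part of speech (POS) and sentence tags to the JCas.
 * It can be set up through a UIMA context or programmatically through the
 * secondary initialize() methods.
 *
 * @author Andreas Fay, Julian Zell
 */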
public class TreeTaggerWrapper extends JCasAnnotator_ImplBase {
	// runtime class of this wrapper; handed to the Logger calls as the message source
	private Class<?> component = this.getClass();
	
	// definitions of what names these parameters have in the wrapper's descriptor file
	public static final String PARAM_LANGUAGE = "language";
	public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
	public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
	public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";
	// NOTE(review): this name is declared here, but initialize(UimaContext) never reads
	// a parameter of that name -- confirm whether it is still needed
	public static final String PARAM_IMPROVE_GERMAN_SENTENCES = "improvegermansentences";
	public static final String PARAM_CHINESE_TOKENIZER_PATH = "ChineseTokenizerPath";
	
	// language for this instance of the treetaggerwrapper; set in initialize(UimaContext)
	private Language language;
	
	// switches for annotation parameters; overwritten in initialize(UimaContext)
	private Boolean annotate_tokens = false;
	private Boolean annotate_sentences = false;
	private Boolean annotate_partofspeech = false;
	
	// container for the treetagger settings (root path, file names, language switch)
	private TreeTaggerProperties ttprops = new TreeTaggerProperties();
	// wrapper around the tagging process; created on first use in doTreeTag(), released in quit()
	private TreeTaggerProcess ttProc = null;
	
	// runnables for the I/O with the tagging process; both are created anew in doTreeTag()
	private TreeTaggerWriter ttwriter;
	private TreeTaggerReader ttreader;
	
	/**
	 * uimacontext to make secondary initialize() method possible.
	 * -> programmatic, non-uima pipeline usage.
	 * @author julian
	 *
	 */
	private class TreeTaggerContext extends RootUimaContext_impl {
		private ConfigurationManager mConfigManager;
		
		// shorthand for when we don't want to supply a cnTokPath
		@SuppressWarnings("unused")
		public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences, 
				Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
			this(language, annotateTokens, annotateSentences, annotatePartOfSpeech, 
					improveGermanSentences, null);
		}
		
		public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences, 
				Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) {
			super();

			// Initialize config
			mConfigManager = new ConfigurationManager_impl();

			// Initialize context
			this.initializeRoot(null, new ResourceManager_impl(), mConfigManager);

			// Set session
			mConfigManager.setSession(this.getSession());
			
			// Set necessary variables
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_LANGUAGE), language.getName());
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_TOKENS), annotateTokens);
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_PARTOFSPEECH), annotatePartOfSpeech);
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_SENTENCES), annotateSentences);
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_CHINESE_TOKENIZER_PATH), cnTokPath);
		}
		
		@Override
		public ConfigurationManager getConfigurationManager() {
			return mConfigManager;
		}
	}
	
	/**
	 * secondary initialize() to use wrapper outside of a uima pipeline.
	 * Convenience variant for callers that have no path to a chinese tokenizer:
	 * it forwards all arguments and supplies null as the cnTokPath.
	 */
	public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens, 
			Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
		final String noChineseTokenizerPath = null;
		
		this.initialize(language, treeTaggerHome, annotateTokens, annotateSentences, 
				annotatePartOfSpeech, improveGermanSentences, noChineseTokenizerPath);
	}
	
	/**
	 * secondary initialize() to use wrapper outside of a uima pipeline.
	 * Stores the TreeTagger folder, wraps the remaining settings into a
	 * TreeTaggerContext and passes that on to initialize(UimaContext).
	 * 
	 * @param language Language/parameter file to use for the TreeTagger
	 * @param treeTaggerHome Path to the TreeTagger folder
	 * @param annotateTokens Whether to annotate tokens
	 * @param annotateSentences Whether to annotate sentences
	 * @param annotatePartOfSpeech Whether to annotate POS tags
	 * @param improveGermanSentences Whether to do improvements for german sentences
	 * @param cnTokPath Path to the chinese tokenizer; may be null
	 */
	public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens, 
			Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) {
		// the home folder has to be known before the context-based initialize() runs,
		// because that method only consults the environment if no root path is set
		setHome(treeTaggerHome);
		
		this.initialize(new TreeTaggerContext(language, annotateTokens, annotateSentences, 
				annotatePartOfSpeech, improveGermanSentences, cnTokPath));
	}
	
	/**
	 * initialization method where we fill configuration values and check some prerequisites.
	 * <p>
	 * Reads the language and the annotation switches from the given context, derives
	 * the names of the TreeTagger parameter file, abbreviation file and tokenizer
	 * script, and checks that these files exist below the TreeTagger root path. The
	 * root path is the one set through setHome() or, if none was set, the value of
	 * the TREETAGGER_HOME environment variable.
	 * <p>
	 * Terminates the JVM through System.exit(-1) if no root path is available or if
	 * a file that is required for the language is missing.
	 * 
	 * @param aContext context that supplies the configuration parameter values
	 */
	public void initialize(UimaContext aContext) {
		// check if the supplied language is one that we can currently handle
		this.language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));
		
		// get configuration from the descriptor. a switch that is not set comes back as
		// null; it is treated as false so that process() can unbox the fields safely
		annotate_tokens = Boolean.TRUE.equals((Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS));
		annotate_sentences = Boolean.TRUE.equals((Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES));
		annotate_partofspeech = Boolean.TRUE.equals((Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH));
		String cnTokPath = (String) aContext.getConfigParameterValue(PARAM_CHINESE_TOKENIZER_PATH);
		
		// set some configuration based upon these values
		ttprops.languageName = language.getTreeTaggerLangName();
		if(ttprops.rootPath == null)
			ttprops.rootPath = System.getenv("TREETAGGER_HOME");
		
		// handle the treetagger path from the environment variables. this has to happen
		// before the root path is used to build any of the file names below.
		if(ttprops.rootPath == null) {
			Logger.printError("TreeTagger environment variable is not present, aborting.");
			System.exit(-1);
		}
		
		ttprops.tokScriptName = "utf8-tokenize.perl";
		
		final String libDir = ttprops.rootPath + ttprops.fileSeparator + "lib";
		final String cmdDir = ttprops.rootPath + ttprops.fileSeparator + "cmd";
		
		// parameter file; get UTF8 version if it exists
		if(new File(libDir, ttprops.languageName + "-utf8.par").exists()) {
			ttprops.parFileName = ttprops.languageName + "-utf8.par";
		} else {
			ttprops.parFileName = ttprops.languageName + ".par";
		}
		
		// abbreviation file; get UTF8 version if it exists
		if(new File(libDir, ttprops.languageName + "-abbreviations-utf8").exists()) {
			ttprops.abbFileName = ttprops.languageName + "-abbreviations-utf8";
		} else {
			ttprops.abbFileName = ttprops.languageName + "-abbreviations";
		}
		
		ttprops.languageSwitch = language.getTreeTaggerSwitch();
		if(cnTokPath != null && !cnTokPath.equals(""))
			ttprops.chineseTokenizerPath = new File(cnTokPath);
		else
			ttprops.chineseTokenizerPath = new File(ttprops.rootPath, "cmd");

		// Check for whether the required treetagger parameter files are present
		boolean abbFileFlag = new File(libDir, ttprops.abbFileName).exists();
		if(!abbFileFlag) {
			if(language.equals(Language.CHINESE) || language.equals(Language.RUSSIAN)) {
				// for these two languages a missing abbreviation file is accepted; the
				// name is cleared so that tokenize() works without such a file
				abbFileFlag = true;
				ttprops.abbFileName = null;
			} else {
				Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName);
			}
		}
		
		boolean parFileFlag = new File(libDir, ttprops.parFileName).exists();
		if(!parFileFlag) {
			Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName);
		}
		
		boolean tokScriptFlag = new File(cmdDir, ttprops.tokScriptName).exists();
		if(!tokScriptFlag) {
			// chinese is tokenized by tokenizeChinese() and does not need the script
			if(language.equals(Language.CHINESE))
				tokScriptFlag = true;
			else
				Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName);
		}

		if(!abbFileFlag || !parFileFlag || !tokScriptFlag) {
			Logger.printError(component, "Cannot find tree tagger (" + cmdDir + ttprops.fileSeparator + ttprops.tokScriptName + ")." +
			" Make sure that path to tree tagger is set correctly in config.props!");
			Logger.printError(component, "If path is set correctly:");
			Logger.printError(component, "Maybe you need to download the TreeTagger tagger-scripts.tar.gz");
			Logger.printError(component, "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz");
			Logger.printError(component, "Extract this file and copy the missing file into the corresponding TreeTagger directories.");
			Logger.printError(component, "If missing, copy " + ttprops.abbFileName   + " into " +  libDir);
			Logger.printError(component, "If missing, copy " + ttprops.parFileName   + " into " +  libDir);
			Logger.printError(component, "If missing, copy " + ttprops.tokScriptName + " into " +  cmdDir);
			System.exit(-1);
		}
	}
	
	/**
	 * Method that gets called to process the documents' cas objects.
	 * Runs, in this order: tokenization (if annotate_tokens is set), TreeTagger
	 * tagging (if annotate_partofspeech is set) and the sentence corrections for
	 * german or french documents.
	 * @param jcas JCas object supplied by the pipeline
	 */
	public void process(JCas jcas) throws AnalysisEngineProcessException {
		// tokens first, because the tagging step works on the tokens in the jcas
		if(annotate_tokens) {
			if(language.equals(Language.CHINESE)) {
				// chinese needs different tokenization
				tokenizeChinese(jcas);
			} else {
				tokenize(jcas);
			}
		}

		// annotate partofspeech and, if specified, also sentences based upon the tags
		if(annotate_partofspeech) {
			doTreeTag(jcas);
		}
		
		// language dependent corrections of the sentence annotations; these depend
		// on the language only, there is no configuration switch checked here
		if(this.language == Language.GERMAN) {
			improveGermanSentences(jcas);
		}
		if(this.language == Language.FRENCH) {
			improveFrenchSentences(jcas);
		}
	}
	
	/**
	 * tokenizes a given JCas object's document text using the treetagger program
	 * and adds the recognized tokens to the JCas object. 
	 * <p>
	 * Double line breaks in the text are replaced by an EMPTYLINE marker before
	 * tokenizing. For each marker a zero-length token with the POS "EMPTYLINE" is
	 * created; it is only added to the indexes if POS annotation is switched on.
	 * Tokens that cannot be found again in the original text are reported and skipped.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void tokenize(JCas jcas) {
		// read tokenized text to add tokens to the jcas
		Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.abbFileName);
		
		EnumSet<Flag> flags = Flag.getSet(ttprops.languageSwitch);
		
		// use the abbreviation file that initialize() selected for the language. (a
		// hard-coded "english-abbreviations" used to overwrite it at this point, which
		// ignored the language and made the branch without a file unreachable.)
		TreeTaggerTokenizer ttt;
		if(ttprops.abbFileName != null) {
			ttt = new TreeTaggerTokenizer(ttprops.rootPath + ttprops.fileSeparator + "lib" + ttprops.fileSeparator + ttprops.abbFileName, flags);
		} else {
			ttt = new TreeTaggerTokenizer(null, flags);
		}
		
		final String originalText = jcas.getDocumentText();
		String docText = originalText.replaceAll("\n\n", "\nEMPTYLINE\n");
		List<String> tokenized = ttt.tokenize(docText);
		
		// position in the original text behind the last token that was located
		int tokenOffset = 0;
		// loop through all the lines in the treetagger output
		for(String s : tokenized) {
			if(s.equals("EMPTYLINE")) {
				// marker for an empty line: zero-length token at the current position
				Token emptyLineToken = new Token(jcas);
				emptyLineToken.setBegin(tokenOffset);
				emptyLineToken.setEnd(tokenOffset);
				emptyLineToken.setPos("EMPTYLINE");
				if(annotate_partofspeech) {
					emptyLineToken.addToIndexes();
				}
				continue;
			}
			
			// locate the token in the original text, searching from the last token onwards
			int begin = originalText.indexOf(s, tokenOffset);
			if(begin < 0) {
				// charset mismatch fallback: signal (invalid) s
				Logger.printError(component, "Tokenization was interrupted because the token \"" + s 
						+ "\" could not be found in the original text. The reason for this might be "
						+ "that the encoding of the document is not UTF-8. This token was skipped and "
						+ "if it was part of a temporal expression, will not be extracted.");
				continue;
			}
			
			// create tokens and add them to the jcas's indexes.
			Token newToken = new Token(jcas);
			newToken.setBegin(begin);
			newToken.setEnd(begin + s.length());
			newToken.addToIndexes();
			tokenOffset = newToken.getEnd();
		}
	}
	
	/**
	 * tokenizes a given JCas object's document text using the chinese tokenization
	 * script and adds the recognized tokens to the JCas object. 
	 * <p>
	 * The document text is sent to the tokenization process line by line; after each
	 * line the output that is available is read and split at whitespace. Errors are
	 * logged and not propagated. The streams and the process are released in any
	 * case, also when tokenization fails half way.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void tokenizeChinese(JCas jcas) {
		Process proc = null;
		try {
			// read tokenized text to add tokens to the jcas
			proc = ttprops.getChineseTokenizationProcess();
			Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath);
			
			final String docText = jcas.getDocumentText();
			
			// both streams are closed when this block is left, normally or by exception
			try (BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
					BufferedWriter out = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"))) {
				// position in the document text behind the last token that was located
				int tokenOffset = 0;
				
				for(String inSplit : docText.split("[\\r\\n]+")) {
					out.write(inSplit);
					out.newLine();
					out.flush();
					
					// do one initial read; stop on end of stream
					String s = in.readLine();
					while(s != null) {
						for(String tok : s.split("\\s+")) {
							// a line with leading whitespace yields an empty first element;
							// it must not become a zero-length token
							if(tok.isEmpty())
								continue;
							
							int begin = docText.indexOf(tok, tokenOffset);
							if(begin < 0)
								throw new RuntimeException("Could not find token " + tok +
										" in JCas after tokenizing with Chinese tokenization script.");
							
							// create tokens and add them to the jcas's indexes.
							Token newToken = new Token(jcas);
							newToken.setBegin(begin);
							newToken.setEnd(begin + tok.length());
							newToken.addToIndexes();
							tokenOffset = newToken.getEnd();
						}
						
						// break out of the loop if the next read will block
						if(!in.ready())
							break;
						
						s = in.readLine();
					}
				}
			}
		} catch (Exception e) {
			// tokenization stays best effort: report the problem, do not abort the pipeline
			Logger.printError(component, "Chinese tokenization failed: " + e.getMessage());
			e.printStackTrace();
		} finally {
			// clean up
			if(proc != null)
				proc.destroy();
		}
	}

	
	/**
	 * based on tokens from the jcas object, adds part of speech (POS) and sentence
	 * tags to the jcas object using the treetagger program.
	 * <p>
	 * The tagging process is started on first use and kept for later calls. The
	 * tokens are written to the process and its output is read by two separate
	 * threads; this method returns when both threads have finished. I/O errors are
	 * logged and not propagated. If the calling thread is interrupted while
	 * waiting, its interrupt flag is set again before the method returns.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void doTreeTag(JCas jcas) {
		try {
			if(ttProc == null) {
				ttProc = new TreeTaggerProcess(ttprops.getTreeTaggingProcess());
			}
			
			Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);
			
			// collect the tokens and, in the same order, their covered texts
			AnnotationIndex ai = jcas.getAnnotationIndex(Token.type);
			List<String> tokenStrings = new ArrayList<>();
			List<Token> tokens = new ArrayList<>();
			for(FSIterator fsi = ai.iterator(); fsi.hasNext();) {
				Token token = (Token) fsi.next();
				tokenStrings.add(token.getCoveredText());
				tokens.add(token);
			}
			
			ttreader = new TreeTaggerReader(tokens, ttProc.getStdout(), jcas, annotate_sentences);
			ttwriter = new TreeTaggerWriter(tokenStrings, ttProc.getStdin());
			
			Thread rThread = new Thread(ttreader);
			Thread wThread = new Thread(ttwriter);
			
			// the reader is started first so that output is consumed while input is written
			rThread.start();
			wThread.start();
			
			rThread.join();
			wThread.join();
		} catch(IOException e) {
			Logger.printError(component, "TreeTagger (pos tagging) failed: " + e.getMessage());
			e.printStackTrace();
		} catch(InterruptedException e) {
			// do not swallow the interruption: set the flag again for the callers
			Thread.currentThread().interrupt();
			Logger.printError(component, "TreeTagger (pos tagging) was interrupted.");
			e.printStackTrace();
		}
	}

	
	/**
	 * Older, temporary file based variant of doTreeTag(); not called from within
	 * this class (hence the "unused" suppression).
	 * <p>
	 * Writes the covered text of every non-empty token to a temporary file, runs
	 * the tagging process on that file and reads its output line by line. Each
	 * output line is set as the POS of the next non-empty token. If
	 * annotate_sentences is set, sentences are created as well: a sentence ends at
	 * a token whose output line is one of the end-of-sentence tags, at an
	 * EMPTYLINE token, or at the last token.
	 * <p>
	 * Terminates the JVM through System.exit(-1) if the temporary file cannot be
	 * created or written. Other errors are printed and not propagated.
	 * @param jcas JCas object supplied by the pipeline
	 */
	@SuppressWarnings({"unused"})
	private void doTreeTagOld(JCas jcas) {
		File tmpDocument = null;
		BufferedWriter tmpFileWriter;
		// all tokens of the document in index order, including the zero-length ones
		ArrayList<Token> tokens = new ArrayList<Token>();
		
		try {
			// create a temporary file and write our pre-existing tokens to it.
			tmpDocument = File.createTempFile("postokens", null);
			tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));

			// iterate over existing tokens
			FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
			while(ai.hasNext()) {
				Token t = (Token) ai.next();
				
				tokens.add(t);
				// zero-length tokens are kept in the list, but not written to the file
				if (!(t.getBegin() == t.getEnd())){
					tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
				}
			}
			
			tmpFileWriter.close();
		} catch(IOException e) {
			// NOTE(review): the writer is not closed on this path; the JVM exits right away
			Logger.printError("Something went wrong creating a temporary file for the treetagger to process.");
			System.exit(-1);
		}

		// Possible End-of-Sentence Tags
		HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
		hsEndOfSentenceTag.add("SENT");   // ENGLISH, FRENCH, GREEK, 
		hsEndOfSentenceTag.add("$.");     // GERMAN, DUTCH
		hsEndOfSentenceTag.add("FS");     // SPANISH
		hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
		hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
		hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
		hsEndOfSentenceTag.add("ew"); // CHINESE
		
		try {
			Process p = ttprops.getTreeTaggingProcess(tmpDocument);
			Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);
				
			BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
			
			// the sentence that is currently being built; null while none is open
			Sentence sentence = null;
			// iterate over all the output lines and tokens array (which have the same source and are hence symmetric)
			int i = 0;
			String s = null;
			while ((s = in.readLine()) != null) {
				// grab a token
				Token token = tokens.get(i++);
				// modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file
				// empty tokens have no output line of their own, so they are skipped
				// here until the token that belongs to the current line is reached
				while (token.getCoveredText().equals("")){
					// if part of the configuration, also add sentences to the jcas document
					if ((annotate_sentences) && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) {
						// Establish sentence structure
						if (sentence == null) {
							sentence = new Sentence(jcas);
							sentence.setBegin(token.getBegin());
						}
		
						// an empty line closes the current sentence; it is only kept if it is not empty
						sentence.setEnd(token.getEnd());
						if (sentence.getBegin() < sentence.getEnd()){
							sentence.addToIndexes();
						}
						
						// Make sure current sentence is not active anymore so that a new one might be created
						sentence = null;
//						sentence = new Sentence(jcas);
					}
					token.removeFromIndexes();
					// NOTE(review): no bounds check here; if only empty tokens are left this
					// throws, which ends up in the catch block below -- confirm this is intended
					token = tokens.get(i++);
				}
				// remove tokens, otherwise they are in the index twice
				token.removeFromIndexes(); 
				// set part of speech tag and add to indexes again
				if (!(token.getCoveredText().equals(""))){
					token.setPos(s);
					token.addToIndexes();
				}
				
				// if part of the configuration, also add sentences to the jcas document
				if(annotate_sentences) {
					// Establish sentence structure
					if (sentence == null) {
						sentence = new Sentence(jcas);
						sentence.setBegin(token.getBegin());
					}
	
					// Finish current sentence if end-of-sentence pos was found or document ended
					if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
						sentence.setEnd(token.getEnd());
						sentence.addToIndexes();
						
						// Make sure current sentence is not active anymore so that a new one might be created
						sentence = null;
					}
				}
			}
			// tokens that are left after the last output line was read
			while (i < tokens.size()){
				// NOTE(review): an open sentence is ended and added to the indexes once per
				// remaining token, i.e. possibly more than once -- confirm this is intended
				if (!(sentence == null)){
					sentence.setEnd(tokens.get(tokens.size()-1).getEnd());
					sentence.addToIndexes();
				}
				Token token = tokens.get(i++);
				if (token.getPos() != null && token.getPos().equals("EMPTYLINE")){
					token.removeFromIndexes();
				}
			}
			// NOTE(review): reader and process are only released here, not if an exception occurs above
			in.close();
			p.destroy();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Delete temporary files
			tmpDocument.delete();
		}
	}
	
	/**
	 * Sets the root path of the TreeTagger installation. initialize(UimaContext)
	 * falls back to the TREETAGGER_HOME environment variable only if no root path
	 * was set.
	 * @param home path to the TreeTagger folder
	 */
	public void setHome(String home) {
		ttprops.rootPath = home;
	}
	
	/**
	 * improve french sentences; merges a sentence that ends with an abbreviation
	 * like "av." or "apr." with the sentence that follows it, if that one starts
	 * with "J.-C.", "J-C." or "NSJC". The sentences are gone through repeatedly
	 * until a pass does not merge anything anymore.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void improveFrenchSentences(JCas jcas) {
		// endings after which a sentence split is suspicious
		final String[] abbreviationEndings = {"av.", "Av.", "apr.", "Apr.", "avant.", "Avant."};
		// beginnings of a following sentence that confirm the wrong split
		final String[] continuationStarts = {"J.-C.", "J-C.", "NSJC"};
		
		// the index is only changed after a pass, not while iterating over it
		HashSet<Sentence> obsolete = new HashSet<Sentence>();
		HashSet<Sentence> joined = new HashSet<Sentence>();
		
		boolean mergedSomething;
		do {
			mergedSomething = false;
			
			FSIterator sentenceIter = jcas.getAnnotationIndex(Sentence.type).iterator();
			while(sentenceIter.hasNext()) {
				Sentence first = (Sentence) sentenceIter.next();
				
				boolean endsWithAbbreviation = false;
				for(String ending : abbreviationEndings) {
					if(first.getCoveredText().endsWith(ending)) {
						endsWithAbbreviation = true;
						break;
					}
				}
				if(!endsWithAbbreviation || !sentenceIter.hasNext()) {
					continue;
				}
				
				// look at the following sentence, then move the iterator back by one
				// position (presumably so that this sentence is visited again as a
				// possible first sentence -- verify against the FSIterator contract)
				Sentence second = (Sentence) sentenceIter.next();
				sentenceIter.moveToPrevious();
				
				for(String start : continuationStarts) {
					if(second.getCoveredText().startsWith(start)) {
						Sentence merged = new Sentence(jcas);
						merged.setBegin(first.getBegin());
						merged.setEnd(second.getEnd());
						joined.add(merged);
						obsolete.add(first);
						obsolete.add(second);
						mergedSomething = true;
						break;
					}
				}
			}
			
			// apply the collected changes to the indexes
			for(Sentence s : obsolete) {
				s.removeFromIndexes(jcas);
			}
			obsolete.clear();
			for(Sentence s : joined) {
				s.addToIndexes(jcas);
			}
			joined.clear();
		} while(mergedSomething);
	}
		
	

	/**
	 * improve german sentences; the treetagger splits german sentences incorrectly on some occasions.
	 * <p>
	 * Neighbouring sentences are compared pairwise. A pair is marked for merging if
	 * the last two tokens of the first sentence together with the first token of
	 * the second sentence match one of the POS rules, or if the first token of the
	 * second sentence starts with a lower case letter or a slash. Pairs that share
	 * a sentence end up in the same group; every group is then replaced by one
	 * sentence that spans from its earliest begin to its latest end.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void improveGermanSentences(JCas jcas) {
		/* 
		 * these POS tag sequences will decide whether we want to merge two sentences
		 * that have (supposedly wrongfully) been split.
		 * NOTE(review): the "." in "\\$." is a regex wildcard, so the pattern accepts
		 * any two-character tag that starts with "$" -- confirm whether only "$." is meant.
		 */
		final String[][] posRules = {
				{"CARD", "\\$.", "NN"},
				{"CARD", "\\$.", "NE"}
		};
		
		FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator();
		
		/* 
		 * groups of sentences that are to be merged. this is a list and not a set of
		 * sets, because the groups grow after they were stored, and changing an
		 * element of a HashSet after insertion invalidates its hash code.
		 */
		List<HashSet<Sentence>> toMerge = new ArrayList<HashSet<Sentence>>();
		
		// compare two sentences at a time in order to have access to all POS tags
		Sentence prevSent = null, thisSent = null;
		while(sentIter.hasNext()) {
			if(thisSent == null) {
				// very first sentence: there is no predecessor to compare with yet
				thisSent = (Sentence) sentIter.next();
				continue;
			}
			
			prevSent = thisSent;
			thisSent = (Sentence) sentIter.next();
			
			/* 
			 * select the last two tokens within the previous sentence as well as the
			 * first of the current one and check for matches.
			 */
			Token penultimateToken = null, ultimateToken = null, firstToken = null;
			FSIterator tokIter = jcas.getAnnotationIndex(Token.type).subiterator(thisSent);
			if(tokIter.hasNext()) {
				firstToken = (Token) tokIter.next();
			}
			
			tokIter = jcas.getAnnotationIndex(Token.type).subiterator(prevSent);
			while(tokIter.hasNext()) {
				penultimateToken = ultimateToken;
				ultimateToken = (Token) tokIter.next();
			}
			
			// check that all tokens for further analysis are present. if not: skip
			if(penultimateToken == null || ultimateToken == null || firstToken == null) {
				continue;
			}
			
			/* 
			 * either one of the pre-defined POS rules fit, or the first token's 
			 * covered text begins with lower case characters.
			 */
			boolean mergeWanted = firstToken.getCoveredText().matches("^[a-z/].*");
			for(int r = 0; !mergeWanted && r < posRules.length; r++) {
				mergeWanted = penultimateToken.getPos() != null && penultimateToken.getPos().matches(posRules[r][0])
						&& ultimateToken.getPos() != null && ultimateToken.getPos().matches(posRules[r][1])
						&& firstToken.getPos() != null && firstToken.getPos().matches(posRules[r][2]);
			}
			if(!mergeWanted) {
				continue;
			}
			
			/* 
			 * check whether one of the previous candidate groups already 
			 * contains one of our sentences.
			 */
			HashSet<Sentence> group = null;
			for(HashSet<Sentence> mergeCandidate : toMerge) {
				if(mergeCandidate.contains(thisSent) || mergeCandidate.contains(prevSent)) {
					group = mergeCandidate;
					break;
				}
			}
			
			/* 
			 * if one of the sentences was not already to be merged with another,
			 * create a new merge candidate group
			 */
			if(group == null) {
				group = new HashSet<Sentence>();
				toMerge.add(group);
			}
			
			// we add both here because sets ignore duplicates
			group.add(prevSent);
			group.add(thisSent);
		}
		
		// iterate over the previously collected merge candidates
		for(HashSet<Sentence> mergeCandidate : toMerge) {
			// find the earliest beginning and latest end for the set of sentences
			int beginIndex = Integer.MAX_VALUE, endIndex = Integer.MIN_VALUE;
			for(Sentence s : mergeCandidate) {
				beginIndex = Math.min(beginIndex, s.getBegin());
				endIndex = Math.max(endIndex, s.getEnd());
				
				// the merged sentence replaces this one
				s.removeFromIndexes();
			}
			
			// set values, add to jcas
			Sentence mergedSent = new Sentence(jcas);
			mergedSent.setBegin(beginIndex);
			mergedSent.setEnd(endIndex);
			mergedSent.addToIndexes();
		}
	}
	
	/**
	 * Closes the tagging process that doTreeTag() started and forgets it, so that
	 * a later tagging call starts a new one. Does nothing if there is no such
	 * process, i.e. if nothing was tagged yet or quit() was already called.
	 */
	public void quit() {
		if(ttProc == null) {
			// the process is only created on first use; nothing to release
			return;
		}
		
		ttProc.close();
		ttProc = null;
	}
}