TreeTaggerWrapper.java example

Explorer

TemporalSearch-master
- common
  - src
    - main
      - java
        de
        l3s
        common
        TextConsole.java
        WikiEventsHBaseImport.java
        features
        TimeSeriesFeatures.java
        hadoop
        TimeSeriesJob.java
        TimeSeriesMapper.java
        TimeSeriesReducer.java
        autocorrelation
        CorrelationJob.java
        CorrelationMapper.java
        CorrelationReducer.java
        movingaverage
        MovingAverageJob.java
        MovingAverageMapper.java
        MovingAverageReducer.java
        NoShuffleSort_MovingAverageJob.java
        NoShuffleSort_MovingAverageMapper.java
        NoShuffleSort_MovingAverageReducer.java
        SlidingWindow.java
        hadoop
        WholeFileInputFormat.java
        WholeFileRecordReader.java
        models
        App.java
        timeseries
        CompositeKeyComparator.java
        KeyData.java
        NaturalKeyGroupingComparator.java
        NaturalKeyPartitioner.java
        Timeseries.java
        TimeseriesDataPoint.java
        TimeseriesKey.java
    - test
      - java
        de
        l3s
        common
        AppTest.java
        hbase
        test
        HBaseTest.java
- content
  - src
    - main
      - java
        de
        l3s
        content
        mapred
        ClueWeb09InputFormat.java
        WikipediaPageInputFormat.java
        WikipediaPagesBz2InputStream.java
        timex
        extracting
        ClueWeb09Timex.java
        ClueWeb09TimexWriteToHDFS.java
        WikiTimex.java
        utils
        DateUtil.java
        edu
        umd
        cloud9
        collection
        Indexable.java
        IndexableFileInputFormat.java
        XMLInputFormat.java
        XMLInputFormatOld.java
        wikipedia
        WikipediaPage.java
        language
        ArabicWikipediaPage.java
        ChineseWikipediaPage.java
        CzechWikipediaPage.java
        EnglishWikipediaPage.java
        GermanWikipediaPage.java
        SpanishWikipediaPage.java
        SwedishWikipediaPage.java
        TurkishWikipediaPage.java
        WikipediaPageFactory.java
        org
        clueweb
        clueweb09
        ClueWeb09WarcRecord.java
        mapreduce
        ClueWeb09InputFormat.java
        data
        DocVector.java
        Indexable.java
        PForDocVector.java
        TermStatistics.java
        VByteDocVector.java
        WarcTrecIdMapping.java
        wikimedia
        wikihadoop
        ByteMatcher.java
        SeekableInputStream.java
        StreamWikiDumpInputFormat.java
- heideltime
  - src
    - de
      - tudarmstadt
        ukp
        dkpro
        core
        type
        Lemma.java
        Lemma_Type.java
        POS.java
        POS_Type.java
        Sentence.java
        Sentence_Type.java
        Stem.java
        Stem_Type.java
        Token.java
        Token_Type.java
        pos
        ADJ.java
        ADJ_Type.java
        ADV.java
        ADV_Type.java
        ART.java
        ART_Type.java
        CARD.java
        CARD_Type.java
        CONJ.java
        CONJ_Type.java
        N.java
        NN.java
        NN_Type.java
        NP.java
        NP_Type.java
        N_Type.java
        O.java
        O_Type.java
        PP.java
        PP_Type.java
        PR.java
        PR_Type.java
        PUNC.java
        PUNC_Type.java
        V.java
        V_Type.java
      - unihd
        dbs
        heideltime
        standalone
        CLISwitch.java
        Config.java
        Constants.java
        DocumentType.java
        HeidelTimeAnnotator.java
        HeidelTimeStandalone.java
        OutputType.java
        POSTagger.java
        components
        JCasFactory.java
        PartOfSpeechTagger.java
        ResultFormatter.java
        UIMAAnnotator.java
        impl
        IntervalTaggerWrapper.java
        JCasFactoryImpl.java
        JVnTextProWrapper.java
        StandaloneConfigContext.java
        StanfordPOSTaggerWrapper.java
        TimeMLResultFormatter.java
        TreeTaggerWrapper.java
        UimaContextImpl.java
        XMIResultFormatter.java
        exceptions
        DocumentCreationTimeMissingException.java
        uima
        annotator
        annotationtranslator
        AnnotationTranslator.java
        heideltime
        HeidelTime.java
        HeidelTimeException.java
        ProcessorManager.java
        processors
        GenericProcessor.java
        HolidayProcessor.java
        ProcessorInitializationException.java
        ProcessorProcessingException.java
        resources
        GenericResourceManager.java
        Language.java
        NormalizationManager.java
        RePatternManager.java
        RegexHashMap.java
        RuleManager.java
        utilities
        ContextAnalyzer.java
        DateCalculator.java
        LocaleException.java
        Logger.java
        Toolbox.java
        intervaltagger
        IntervalTagger.java
        jvntextprowrapper
        JVnTextProWrapper.java
        stanfordtagger
        StanfordPOSTaggerWrapper.java
        treetagger
        TreeTaggerWrapper.java
        consumer
        aceternwriter
        ACETernWriter.java
        tempeval2writer
        Tempeval2Writer.java
        tempeval3writer
        TempEval3Writer.java
        reader
        aceternreader
        ACETernReader.java
        tempeval2reader
        Tempeval2Reader.java
        tempeval3reader
        Tempeval3Reader.java
        types
        heideltime
        Dct.java
        Dct_Type.java
        Event.java
        Event_Type.java
        GoldEvent.java
        GoldEvent_Type.java
        IntervalCandidateSentence.java
        IntervalCandidateSentence_Type.java
        Sentence.java
        Sentence_Type.java
        SourceDocInfo.java
        SourceDocInfo_Type.java
        Timex3.java
        Timex3Interval.java
        Timex3Interval_Type.java
        Timex3_Type.java
        Token.java
        Token_Type.java

/**
 * This is a preprocessing engine for use in a UIMA pipeline. It will invoke
 * the tree-tagger binary that is supposed to be available on the system
 * through Java process access.
 */
package de.unihd.dbs.uima.annotator.treetagger;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.impl.RootUimaContext_impl;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ConfigurationManager;
import org.apache.uima.resource.impl.ConfigurationManager_impl;
import org.apache.uima.resource.impl.ResourceManager_impl;

import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;

/**
 * @author Andreas Fay, Julian Zell
 *
 */
public class TreeTaggerWrapper extends JCasAnnotator_ImplBase {
	// class token handed to the Logger calls below so that messages can be attributed to this component
	private Class<?> component = this.getClass();
	
	// names of the configuration parameters as they are looked up in the UIMA context
	// (see initialize(UimaContext)); TreeTaggerContext writes values under the same names
	public static final String PARAM_LANGUAGE = "language";
	public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
	public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
	public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";
	public static final String PARAM_IMPROVE_GERMAN_SENTENCES = "improvegermansentences";
	
	// language for this instance of the treetaggerwrapper; assigned in initialize(UimaContext)
	private Language language;
	
	// switches that decide which steps process() runs; all start out disabled and are
	// overwritten with the configured values in initialize(UimaContext)
	private Boolean annotate_tokens = false;
	private Boolean annotate_sentences = false;
	private Boolean annotate_partofspeech = false;
	private Boolean improve_german_sentences = false;
	
	// container for the treetagger installation path, file names and command line switches
	// (see the TreeTaggerProperties class below)
	private TreeTaggerProperties ttprops = new TreeTaggerProperties();
	
	/**
	 * Plain settings holder for everything that is needed to assemble the
	 * treetagger command lines: installation path, file names and switches.
	 * All reference fields start out as null unless an initial value is given.
	 * @author Julian Zell
	 *
	 */
	private class TreeTaggerProperties {
		// name of the language as it appears in the treetagger's file names
		public String languageName;
		
		// directory of the treetagger installation (contains bin, cmd and lib)
		public String rootPath;

		// tokenizer script (in cmd) as well as parameter and abbreviation files (in lib)
		public String tokScriptName;
		public String parFileName;
		public String abbFileName;

		// extra tokenizer switch that some languages need, e.g. english, italian
		// and french (see tagger readme)
		public String languageSwitch;

		// perl switch for utf-8 encoded input/output, see
		// http://perldoc.perl.org/perlrun.html#Command-Switches (-C).
		// HeidelTimeStandalone.java always converts the input to UTF-8, hence the fixed value.
		public String utf8Switch = "-CSD";
		
		// platform dependent separators, looked up once and reused when building strings
		public String newLineSeparator = System.getProperty("line.separator");
		public String fileSeparator = System.getProperty("file.separator");
	}
	
	/**
	 * Root UIMA context that is filled programmatically instead of from a
	 * descriptor file. It backs the secondary initialize() method and thereby
	 * allows the wrapper to be used outside of a UIMA pipeline.
	 * @author julian
	 *
	 */
	private class TreeTaggerContext extends RootUimaContext_impl {
		public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences, 
				Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
			// the superclass' no-argument constructor runs implicitly at this point

			// a fresh configuration manager becomes part of the root context ...
			final ConfigurationManager settings = new ConfigurationManager_impl();
			initializeRoot(null, new ResourceManager_impl(), settings);

			// ... and needs to know the session of that context
			settings.setSession(getSession());

			// publish the caller's values under the names initialize(UimaContext) looks up
			settings.setConfigParameterValue(makeQualifiedName(PARAM_LANGUAGE), language.getName());
			settings.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_TOKENS), annotateTokens);
			settings.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_PARTOFSPEECH), annotatePartOfSpeech);
			settings.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_SENTENCES), annotateSentences);
			settings.setConfigParameterValue(makeQualifiedName(PARAM_IMPROVE_GERMAN_SENTENCES), improveGermanSentences);
		}
	}
	
	/**
	 * Convenience initialization for callers that do not run a UIMA pipeline:
	 * stores the treetagger directory, wraps the remaining arguments into a
	 * {@link TreeTaggerContext} and delegates to {@link #initialize(UimaContext)}.
	 * @param language language of the documents that will be processed
	 * @param treeTaggerHome path of the treetagger installation
	 * @param annotateTokens value for the annotate_tokens parameter
	 * @param annotateSentences value for the annotate_sentences parameter
	 * @param annotatePartOfSpeech value for the annotate_partofspeech parameter
	 * @param improveGermanSentences value for the improvegermansentences parameter
	 */
	public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens, 
			Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
		// the home directory has to be set first: initialize(UimaContext) only falls
		// back to the environment variable if no path is known at that point
		setHome(treeTaggerHome);

		initialize(new TreeTaggerContext(language, annotateTokens, annotateSentences,
				annotatePartOfSpeech, improveGermanSentences));
	}
	
	/**
	 * Initialization method where we fill configuration values and check some prerequisites.
	 * <p>
	 * Reads the language and the four annotation switches from the context, resolves
	 * the treetagger directory (a path set via {@link #setHome(String)} wins over the
	 * TREETAGGER_HOME environment variable), picks the UTF-8 variants of the parameter
	 * and abbreviation files when they exist, and verifies that the files needed later
	 * are present. A missing directory or file is reported through the Logger and
	 * terminates the JVM with exit code -1.
	 * @param aContext context holding the configuration parameters named by the PARAM_* constants
	 */
	public void initialize(UimaContext aContext) {
		// check if the supplied language is one that we can currently handle
		this.language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));
		
		// get configuration from the context; parameters that are not set count as "false"
		// so that process() never has to unbox a null Boolean
		annotate_tokens = readBooleanParameter(aContext, PARAM_ANNOTATE_TOKENS);
		annotate_sentences = readBooleanParameter(aContext, PARAM_ANNOTATE_SENTENCES);
		annotate_partofspeech = readBooleanParameter(aContext, PARAM_ANNOTATE_PARTOFSPEECH);
		improve_german_sentences = readBooleanParameter(aContext, PARAM_IMPROVE_GERMAN_SENTENCES);
		
		// language dependent settings
		ttprops.languageName = language.getTreeTaggerLangName();
		ttprops.languageSwitch = language.getTreeTaggerSwitch();
		ttprops.tokScriptName = "utf8-tokenize.perl";
		
		// handle the treetagger path: fall back to the environment variable and give up
		// before the path is used to build any file name
		if(ttprops.rootPath == null)
			ttprops.rootPath = System.getenv("TREETAGGER_HOME");
		if(ttprops.rootPath == null) {
			Logger.printError("TreeTagger environment variable is not present, aborting.");
			System.exit(-1);
		}
		
		final String libDir = ttprops.rootPath + ttprops.fileSeparator + "lib";
		final String cmdDir = ttprops.rootPath + ttprops.fileSeparator + "cmd";
		
		// parameter file: get UTF8 version if it exists
		final String utf8ParFileName = ttprops.languageName + "-utf8.par";
		if(new File(libDir, utf8ParFileName).exists())
			ttprops.parFileName = utf8ParFileName;
		else
			ttprops.parFileName = ttprops.languageName + ".par";
		
		// abbreviation file: get UTF8 version if it exists
		final String utf8AbbFileName = ttprops.languageName + "-abbreviations-utf8";
		if(new File(libDir, utf8AbbFileName).exists())
			ttprops.abbFileName = utf8AbbFileName;
		else
			ttprops.abbFileName = ttprops.languageName + "-abbreviations";

		// Check for whether the required treetagger parameter files are present
		final boolean abbFilePresent   = new File(libDir, ttprops.abbFileName).exists();
		final boolean parFilePresent   = new File(libDir, ttprops.parFileName).exists();
		final boolean tokScriptPresent = new File(cmdDir, ttprops.tokScriptName).exists();
		if (!abbFilePresent) {
			Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName);
		}
		if (!parFilePresent) {
			Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName);
		}
		if (!tokScriptPresent) {
			Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName);
		}

		if (!abbFilePresent || !parFilePresent || !tokScriptPresent) {
			Logger.printError(component, "Cannot find tree tagger (" + ttprops.rootPath + ttprops.fileSeparator 
					+ "cmd" + ttprops.fileSeparator + ttprops.tokScriptName + ")." +
			" Make sure that path to tree tagger is set correctly in config.props!");
			Logger.printError(component, "If path is set correctly:");
			Logger.printError(component, "Maybe you need to download the TreeTagger tagger-scripts.tar.gz");
			Logger.printError(component, "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz");
			Logger.printError(component, "Extract this file and copy the missing file into the corresponding TreeTagger directories.");
			Logger.printError(component, "If missing, copy " + ttprops.abbFileName   + " into " +  libDir);
			Logger.printError(component, "If missing, copy " + ttprops.parFileName   + " into " +  libDir);
			Logger.printError(component, "If missing, copy " + ttprops.tokScriptName + " into " +  cmdDir);
			System.exit(-1);
		}
	}

	/**
	 * Looks up a Boolean configuration parameter and maps an unset (null) value to false.
	 * @param aContext context to read the parameter from
	 * @param parameterName name of the parameter
	 * @return the configured value, or false if the context holds no value for the name
	 */
	private static boolean readBooleanParameter(UimaContext aContext, String parameterName) {
		Boolean value = (Boolean) aContext.getConfigParameterValue(parameterName);
		return value != null && value.booleanValue();
	}
	
	/**
	 * Entry point that is invoked for every document's cas object. Depending on
	 * the configured switches it runs up to three steps, always in this order:
	 * tokenization, part of speech tagging, repair of german sentence boundaries.
	 * @param jcas JCas object supplied by the pipeline
	 */
	public void process(JCas jcas) throws AnalysisEngineProcessException {
		// step 1 (annotate_tokens): let the treetagger tokenizer add the tokens to the jcas
		if(annotate_tokens) {
			tokenize(jcas);
		}

		// step 2 (annotate_partofspeech): tag the tokens; doTreeTag() also creates the
		// sentences from the tags when sentence annotation is switched on
		if(annotate_partofspeech) {
			doTreeTag(jcas);
		}
		
		// step 3 (improve_german_sentences): correct sentence splits made by the treetagger
		if(improve_german_sentences) {
			improveGermanSentences(jcas);
		}
	}
	
	/**
	 * Tokenizes a given JCas object's document text using the treetagger's perl
	 * tokenizer script and adds the recognized tokens to the JCas object.
	 * <p>
	 * The document text is written to a temporary UTF-8 file, the script is run
	 * on that file and every line of its output is taken as one token. Tokens
	 * are located in the document text from left to right, so each search starts
	 * where the previous token ended. Failures are reported and end the
	 * tokenization; they are not propagated to the caller.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void tokenize(JCas jcas) {
		File tmpDocument = null;
		BufferedWriter tmpFileWriter = null;
		BufferedReader in = null;
		Process p = null;

		try {
			final String documentText = jcas.getDocumentText();

			// Create temp file containing the document text
			tmpDocument = File.createTempFile("pos", null);
			tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));
			tmpFileWriter.write(documentText);
			tmpFileWriter.close();
			
			// assemble a command line for the tokenization script and execute it;
			// optional switches are only passed on when they actually have content
			// (compared by value - a reference comparison against "" lets null and
			// non-interned empty strings slip through as arguments)
			ArrayList<String> command = new ArrayList<String>();
			command.add("perl");
			if(ttprops.utf8Switch != null && ttprops.utf8Switch.length() > 0)
				command.add(ttprops.utf8Switch);
			command.add(ttprops.rootPath + ttprops.fileSeparator + "cmd" + ttprops.fileSeparator + ttprops.tokScriptName);
			if(ttprops.languageSwitch != null && ttprops.languageSwitch.length() > 0)
				command.add(ttprops.languageSwitch);
			command.add("-a");
			command.add(ttprops.rootPath + ttprops.fileSeparator + "lib" + ttprops.fileSeparator + ttprops.abbFileName);
			command.add(tmpDocument.getAbsolutePath());
			
			// NOTE(review): the error stream of the child process is not read here;
			// a script that writes a lot to stderr could block - confirm whether that can happen
			p = Runtime.getRuntime().exec(command.toArray(new String[command.size()]));
			Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.tokScriptName + " and " + ttprops.abbFileName);
			
			// read tokenized text to add tokens to the jcas
			in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
			String s;
			int tokenOffset = 0;
			// loop through all the lines in the treetagger output
			while ((s = in.readLine()) != null) {
				final int tokenBegin = documentText.indexOf(s, tokenOffset);
				
				// charset mismatch fallback: signal (invalid) s
				if (tokenBegin < 0)
					throw new RuntimeException("Opps! Could not find token "+s+
							" in JCas after tokenizing with TreeTagger." +
							" Hmm, there may exist a charset missmatch!" +
							" Default encoding is " + Charset.defaultCharset().name() + 
							" and should always be UTF-8 (use -Dfile.encoding=UTF-8)." +
							" If input document is not UTF-8 use -e option to set it according to the input, additionally.");

				// create tokens and add them to the jcas's indexes.
				Token newToken = new Token(jcas);
				newToken.setBegin(tokenBegin);
				newToken.setEnd(tokenBegin + s.length());
				newToken.addToIndexes();
				tokenOffset = tokenBegin + s.length();
			}
		} catch (Exception e) {
			// deliberately best effort: report the problem and leave the jcas with
			// the tokens that were added up to this point
			Logger.printError(component, "TreeTagger tokenization failed: " + e);
			e.printStackTrace();
		} finally {
			// I/O housekeeping - runs on success and on failure alike
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (tmpFileWriter != null) {
				try {
					tmpFileWriter.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (p != null)
				p.destroy();

			// Delete temp file, no matter how far the method got after creating it
			if (tmpDocument != null)
				tmpDocument.delete();
		}

	}
	
	/**
	 * Based on tokens from the jcas object, adds part of speech (POS) and sentence
	 * tags to the jcas object using the treetagger program.
	 * <p>
	 * The covered text of every token is written to a temporary UTF-8 file, one
	 * token per line, and the tree-tagger binary is run on it. Each output line
	 * is stored as the POS value of the next non-empty token; tokens with empty
	 * text receive an empty POS value. If sentence annotation is switched on, a
	 * sentence ends at a token with an end-of-sentence tag, at the last token,
	 * or - if the tagger output ends early - at the last token that was tagged.
	 * Failing to write the temporary file terminates the JVM with exit code -1;
	 * failures while tagging are reported and end the tagging.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void doTreeTag(JCas jcas) {
		File tmpDocument = null;
		BufferedWriter tmpFileWriter = null;
		ArrayList<Token> tokens = new ArrayList<Token>();
		
		try {
			// create a temporary file and write our pre-existing tokens to it.
			tmpDocument = File.createTempFile("postokens", null);
			tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));

			// iterate over existing tokens
			FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
			while(ai.hasNext()) {
				Token t = (Token) ai.next();
				
				tokens.add(t);
				tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
			}
			
			tmpFileWriter.close();
		} catch(IOException e) {
			// do not leave the half-written file behind: System.exit() skips any finally block
			if(tmpFileWriter != null) {
				try {
					tmpFileWriter.close();
				} catch(IOException ignored) {
					// the file is discarded anyway
				}
			}
			if(tmpDocument != null)
				tmpDocument.delete();
			Logger.printError("Something went wrong creating a temporary file for the treetagger to process.");
			System.exit(-1);
		}

		// Possible End-of-Sentence Tags
		HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
		hsEndOfSentenceTag.add("SENT");   // ENGLISH, FRENCH, GREEK, 
		hsEndOfSentenceTag.add("$.");     // GERMAN, DUTCH
		hsEndOfSentenceTag.add("FS");     // SPANISH
		hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
		hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
		hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
		
		Process p = null;
		BufferedReader in = null;
		try {
			// assemble a command line based on configuration and execute the POS tagging.
			ArrayList<String> command = new ArrayList<String>();
			command.add(ttprops.rootPath + ttprops.fileSeparator + "bin" + ttprops.fileSeparator + "tree-tagger");
			command.add(ttprops.rootPath + ttprops.fileSeparator + "lib" + ttprops.fileSeparator + ttprops.parFileName);
			command.add(tmpDocument.getAbsolutePath());
			command.add("-no-unknown");
			
			p = Runtime.getRuntime().exec(command.toArray(new String[command.size()]));
			Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);
				
			in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
			
			Sentence sentence = null;
			Token lastTaggedToken = null;
			// iterate over all the output lines and tokens array (which have the same source and are hence symmetric)
			int i = 0;
			String s = null;
			while ((s = in.readLine()) != null) {
				// modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file.
				// They have no output line of their own, so they are passed over here; like every
				// other token they are taken out of the index before being added again.
				while (i < tokens.size() && tokens.get(i).getCoveredText().equals("")) {
					Token emptyToken = tokens.get(i++);
					emptyToken.removeFromIndexes();
					emptyToken.setPos("");
					emptyToken.addToIndexes();
				}
				
				// output and tokens are out of step: stop instead of running past the token list
				if (i >= tokens.size()) {
					Logger.printError(component, "TreeTagger returned more lines than there are tokens; ignoring the surplus output.");
					break;
				}
				
				// grab a token
				Token token = tokens.get(i++);
				// remove tokens, otherwise they are in the index twice
				token.removeFromIndexes(); 
				// set part of speech tag and add to indexes again
				token.setPos(s);
				token.addToIndexes();
				lastTaggedToken = token;
				
				// if part of the configuration, also add sentences to the jcas document
				if(annotate_sentences) {
					// Establish sentence structure
					if (sentence == null) {
						sentence = new Sentence(jcas);
						sentence.setBegin(token.getBegin());
					}
	
					// Finish current sentence if end-of-sentence pos was found or document ended
					if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
						sentence.setEnd(token.getEnd());
						sentence.addToIndexes();
						
						// Make sure current sentence is not active anymore so that a new one might be created
						sentence = null;
					}
				}
			}
			
			// the output ended while a sentence was still open (e.g. only empty tokens
			// were left): close it at the last tagged token rather than dropping it
			if (sentence != null && lastTaggedToken != null) {
				sentence.setEnd(lastTaggedToken.getEnd());
				sentence.addToIndexes();
			}
		} catch (Exception e) {
			// deliberately best effort: report the problem and keep what was tagged so far
			Logger.printError(component, "TreeTagger part of speech tagging failed: " + e);
			e.printStackTrace();
		} finally {
			// release the reader and the child process on every path
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (p != null)
				p.destroy();
			
			// Delete temporary files
			if (tmpDocument != null)
				tmpDocument.delete();
		}

	}
	
	/**
	 * Stores the path of the treetagger installation. A path set here takes
	 * precedence over the TREETAGGER_HOME environment variable, which
	 * initialize(UimaContext) only consults when no path has been set.
	 * @param home path of the treetagger installation
	 */
	public void setHome(String home) {
		ttprops.rootPath = home;
	}

	/**
	 * Improve german sentences; the treetagger splits german sentences incorrectly on some occasions.
	 * <p>
	 * Whenever the tail of a sentence ends with digits followed by a dot (and
	 * optional whitespace) and the following sentence starts with a month name
	 * or one of a few time units, the two sentences are replaced by a single
	 * one that spans both. The index is scanned repeatedly until a complete
	 * pass makes no more changes.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void improveGermanSentences(JCas jcas) {
		// words that indicate that the preceding "<digits>." did not end a sentence
		final String[] sentenceBeginnings = {
			"Januar", "Februar", "März", "April", "Mai", "Juni",
			"Juli", "August", "September", "Oktober", "November", "Dezember",
			"Jahrhundert", "Jahr", "Monat", "Woche"
		};
		
		// compiled once instead of once per sentence
		final java.util.regex.Pattern endsWithNumberAndDot = java.util.regex.Pattern.compile(".*[\\d]+\\.[\\s\\n]*$");
		
		// changes are collected and applied after each pass so that the index is
		// not modified while it is being iterated
		HashSet<Sentence> hsRemoveAnnotations = new HashSet<Sentence>();
		HashSet<Sentence> hsAddAnnotations    = new HashSet<Sentence>();
		
		boolean changes = true;
		while (changes) {
			changes = false;
			FSIndex annoHeidelSentences = jcas.getAnnotationIndex(Sentence.type);
			FSIterator iterHeidelSent   = annoHeidelSentences.iterator();
			while (iterHeidelSent.hasNext()){
				Sentence s1 = (Sentence) iterHeidelSent.next();
				String text1 = s1.getCoveredText();
				
				// only the tail of the sentence is inspected; the offset is capped at the
				// text length so that an empty sentence cannot make substring() throw
				int substringOffset = Math.min(Math.max(text1.length()-4, 1), text1.length());
				if (!endsWithNumberAndDot.matcher(text1.substring(substringOffset)).matches())
					continue;
				if (!iterHeidelSent.hasNext())
					continue;
				
				// peek at the following sentence, then step back so that it is
				// examined as s1 in the next round as well
				Sentence s2 = (Sentence) iterHeidelSent.next();
				iterHeidelSent.moveToPrevious();
				
				String text2 = s2.getCoveredText();
				for (String beg : sentenceBeginnings){
					if (text2.startsWith(beg)){
						Sentence s3 = new Sentence(jcas);
						s3.setBegin(s1.getBegin());
						s3.setEnd(s2.getEnd());
						hsAddAnnotations.add(s3);
						hsRemoveAnnotations.add(s1);
						hsRemoveAnnotations.add(s2);
						changes = true;
						break;
					}
				}
			}
			for (Sentence s : hsRemoveAnnotations){
				s.removeFromIndexes(jcas);
			}
			hsRemoveAnnotations.clear();
			for (Sentence s : hsAddAnnotations){
				s.addToIndexes(jcas);
			}
			hsAddAnnotations.clear();
		}
	}
}