/**
 * This is a preprocessing engine for use in a UIMA pipeline. It will invoke
 * the tree-tagger binary that is supposed to be available on the system
 * through Java process access.
 */
package de.unihd.dbs.uima.annotator.treetagger;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.impl.RootUimaContext_impl;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ConfigurationManager;
import org.apache.uima.resource.impl.ConfigurationManager_impl;
import org.apache.uima.resource.impl.ResourceManager_impl;

import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.annotator.treetagger.TreeTaggerTokenizer.Flag;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;

/**
 * UIMA analysis engine that wraps the external TreeTagger binary for
 * tokenization, sentence splitting and part-of-speech tagging. The TreeTagger
 * installation directory is taken either from {@link #setHome(String)} /
 * the programmatic {@code initialize(...)} overloads or from the
 * {@code TREETAGGER_HOME} environment variable.
 *
 * @author Andreas Fay, Julian Zell
 */
public class TreeTaggerWrapper extends JCasAnnotator_ImplBase {
	private Class<?> component = this.getClass();

	// definitions of what names these parameters have in the wrapper's descriptor file
	public static final String PARAM_LANGUAGE = "language";
	public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
	public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
	public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";
	public static final String PARAM_IMPROVE_GERMAN_SENTENCES = "improvegermansentences";
	public static final String PARAM_CHINESE_TOKENIZER_PATH = "ChineseTokenizerPath";

	// language for this instance of the treetaggerwrapper
	private Language language;

	// switches for annotation parameters
	private Boolean annotate_tokens = false;
	private Boolean annotate_sentences = false;
	private Boolean annotate_partofspeech = false;

	// local treetagger properties container, see below
	private TreeTaggerProperties ttprops = new TreeTaggerProperties();
	private TreeTaggerProcess ttProc = null;

	// processing threads for I/O
	private TreeTaggerWriter ttwriter;
	private TreeTaggerReader ttreader;

	/**
	 * uimacontext to make secondary initialize() method possible.
	 * -> programmatic, non-uima pipeline usage.
	 * @author julian
	 */
	private class TreeTaggerContext extends RootUimaContext_impl {
		private ConfigurationManager mConfigManager;

		// shorthand for when we don't want to supply a cnTokPath
		@SuppressWarnings("unused")
		public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences,
				Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
			this(language, annotateTokens, annotateSentences, annotatePartOfSpeech, improveGermanSentences, null);
		}

		public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences,
				Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) {
			super();

			// Initialize config
			mConfigManager = new ConfigurationManager_impl();

			// Initialize context
			this.initializeRoot(null, new ResourceManager_impl(), mConfigManager);

			// Set session
			mConfigManager.setSession(this.getSession());

			// Set necessary variables.
			// NOTE(review): improveGermanSentences is accepted but never stored; process()
			// decides German/French sentence repair from the language alone — confirm intended.
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_LANGUAGE), language.getName());
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_TOKENS), annotateTokens);
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_PARTOFSPEECH), annotatePartOfSpeech);
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_SENTENCES), annotateSentences);
			mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_CHINESE_TOKENIZER_PATH), cnTokPath);
		}

		@Override
		public ConfigurationManager getConfigurationManager() {
			return mConfigManager;
		}
	}

	/**
	 * secondary initialize() to use wrapper outside of a uima pipeline
	 * shorthand for when we don't want to specify a cnTokPath
	 */
	public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens,
			Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
		this.initialize(language, treeTaggerHome, annotateTokens, annotateSentences, annotatePartOfSpeech,
				improveGermanSentences, null);
	}

	/**
	 * secondary initialize() to use wrapper outside of a uima pipeline
	 *
	 * @param language Language/parameter file to use for the TreeTagger
	 * @param treeTaggerHome Path to the TreeTagger folder
	 * @param annotateTokens Whether to annotate tokens
	 * @param annotateSentences Whether to annotate sentences
	 * @param annotatePartOfSpeech Whether to annotate POS tags
	 * @param improveGermanSentences Whether to do improvements for german sentences
	 * @param cnTokPath path to the Chinese tokenizer scripts; may be null
	 */
	public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens,
			Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences,
			String cnTokPath) {
		this.setHome(treeTaggerHome);

		TreeTaggerContext ttContext = new TreeTaggerContext(language, annotateTokens, annotateSentences,
				annotatePartOfSpeech, improveGermanSentences, cnTokPath);

		this.initialize(ttContext);
	}

	/**
	 * initialization method where we fill configuration values and check some prerequisites.
	 * Aborts the JVM (System.exit) when the TreeTagger home or its required files are missing.
	 */
	public void initialize(UimaContext aContext) {
		// check if the supplied language is one that we can currently handle
		this.language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));

		// get configuration from the descriptor
		annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS);
		annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES);
		annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH);
		String cnTokPath = (String) aContext.getConfigParameterValue(PARAM_CHINESE_TOKENIZER_PATH);

		// set some configuration based upon these values
		ttprops.languageName = language.getTreeTaggerLangName();
		if (ttprops.rootPath == null)
			ttprops.rootPath = System.getenv("TREETAGGER_HOME");

		// handle the treetagger path from the environment variables.
		// FIX: this check must run before rootPath is used to build file paths below,
		// otherwise an unset TREETAGGER_HOME yields "null/..." paths or an NPE.
		if (ttprops.rootPath == null) {
			Logger.printError("TreeTagger environment variable is not present, aborting.");
			System.exit(-1);
		}

		ttprops.tokScriptName = "utf8-tokenize.perl";

		// parameter file: prefer the UTF-8 version if it exists
		if (!(new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.languageName + "-utf8.par").exists()))
			ttprops.parFileName = ttprops.languageName + ".par";
		else
			ttprops.parFileName = ttprops.languageName + "-utf8.par";

		// abbreviation file: prefer the UTF-8 version if it exists
		if (new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.languageName + "-abbreviations-utf8").exists()) {
			ttprops.abbFileName = ttprops.languageName + "-abbreviations-utf8";
		} else {
			ttprops.abbFileName = ttprops.languageName + "-abbreviations";
		}

		ttprops.languageSwitch = language.getTreeTaggerSwitch();

		if (cnTokPath != null && !cnTokPath.equals(""))
			ttprops.chineseTokenizerPath = new File(cnTokPath);
		else
			ttprops.chineseTokenizerPath = new File(ttprops.rootPath, "cmd");

		// Check for whether the required treetagger parameter files are present
		Boolean abbFileFlag = true;
		Boolean parFileFlag = true;
		Boolean tokScriptFlag = true;
		File abbFile = new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.abbFileName);
		File parFile = new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.parFileName);
		File tokFile = new File(ttprops.rootPath + ttprops.fileSeparator + "cmd", ttprops.tokScriptName);

		if (!(abbFileFlag = abbFile.exists())) {
			// Chinese and Russian are handled without an abbreviation file
			if (language.equals(Language.CHINESE) || language.equals(Language.RUSSIAN)) {
				abbFileFlag = true;
				ttprops.abbFileName = null;
			} else {
				Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName);
			}
		}
		if (!(parFileFlag = parFile.exists())) {
			Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName);
		}
		if (!(tokScriptFlag = tokFile.exists())) {
			// Chinese uses its own tokenization script, so the perl script may be absent
			if (language.equals(Language.CHINESE))
				tokScriptFlag = true;
			else
				Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName);
		}

		if (!abbFileFlag || !parFileFlag || !tokScriptFlag) {
			Logger.printError(component, "Cannot find tree tagger (" + ttprops.rootPath + ttprops.fileSeparator
					+ "cmd" + ttprops.fileSeparator + ttprops.tokScriptName + ")."
					+ " Make sure that path to tree tagger is set correctly in config.props!");
			Logger.printError(component, "If path is set correctly:");
			Logger.printError(component, "Maybe you need to download the TreeTagger tagger-scripts.tar.gz");
			Logger.printError(component, "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz");
			Logger.printError(component, "Extract this file and copy the missing file into the corresponding TreeTagger directories.");
			Logger.printError(component, "If missing, copy " + ttprops.abbFileName + " into " + ttprops.rootPath + ttprops.fileSeparator + "lib");
			Logger.printError(component, "If missing, copy " + ttprops.parFileName + " into " + ttprops.rootPath + ttprops.fileSeparator + "lib");
			Logger.printError(component, "If missing, copy " + ttprops.tokScriptName + " into " + ttprops.rootPath + ttprops.fileSeparator + "cmd");
			System.exit(-1);
		}
	}

	/**
	 * Method that gets called to process the documents' cas objects
	 */
	public void process(JCas jcas) throws AnalysisEngineProcessException {
		// if the annotate_tokens flag is set, annotate the tokens and add them to the jcas
		if (annotate_tokens)
			if (language.equals(Language.CHINESE))
				tokenizeChinese(jcas); // chinese needs different tokenization
			else
				tokenize(jcas);

		/* if the annotate_partofspeech flag is set, annotate partofspeech and,
		 * if specified, also tag sentences based upon the partofspeech tags. */
		if (annotate_partofspeech)
			doTreeTag(jcas);

		// if the language is German, improve the sentence tokens made by the treetagger
		if (this.language == Language.GERMAN)
			improveGermanSentences(jcas);

		// if French, improve the sentence tokens made by the TreeTagger with settings for French
		if (this.language == Language.FRENCH)
			improveFrenchSentences(jcas);
	}

	/**
	 * tokenizes a given JCas object's document text using the treetagger program
	 * and adds the recognized tokens to the JCas object.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void tokenize(JCas jcas) {
		// read tokenized text to add tokens to the jcas
		Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.abbFileName);
		EnumSet<Flag> flags = Flag.getSet(ttprops.languageSwitch);
		TreeTaggerTokenizer ttt;
		// FIX: removed hard-coded `ttprops.abbFileName = "english-abbreviations";` which
		// overrode the per-language abbreviation file selected in initialize() and made
		// the null check below (Chinese/Russian have no abbreviation file) dead code.
		if (ttprops.abbFileName != null) {
			ttt = new TreeTaggerTokenizer(ttprops.rootPath + ttprops.fileSeparator + "lib"
					+ ttprops.fileSeparator + ttprops.abbFileName, flags);
		} else {
			ttt = new TreeTaggerTokenizer(null, flags);
		}

		// mark empty lines so sentence boundaries across blank lines survive tokenization
		String docText = jcas.getDocumentText().replaceAll("\n\n", "\nEMPTYLINE\n");
		List<String> tokenized = ttt.tokenize(docText);

		int tokenOffset = 0;
		// loop through all the lines in the treetagger output
		for (String s : tokenized) {
			// charset missmatch fallback: signal (invalid) s
			if ((!(s.equals("EMPTYLINE"))) && (jcas.getDocumentText().indexOf(s, tokenOffset) < 0)) {
				Logger.printError(component, "Tokenization was interrupted because the token \"" + s
						+ "\" could not be found in the original text. The reason for this might be "
						+ "that the encoding of the document is not UTF-8. This token was skipped and "
						+ "if it was part of a temporal expression, will not be extracted.");
				continue;
			}

			// create tokens and add them to the jcas's indexes.
			Token newToken = new Token(jcas);
			if (s.equals("EMPTYLINE")) {
				// zero-width marker token carrying the EMPTYLINE pseudo-POS;
				// only indexed when POS tagging will consume it later
				newToken.setBegin(tokenOffset);
				newToken.setEnd(tokenOffset);
				newToken.setPos("EMPTYLINE");
				if (annotate_partofspeech) {
					newToken.addToIndexes();
				}
			} else {
				newToken.setBegin(jcas.getDocumentText().indexOf(s, tokenOffset));
				newToken.setEnd(newToken.getBegin() + s.length());
				newToken.addToIndexes();
				tokenOffset = newToken.getEnd();
			}
		}
	}

	/**
	 * tokenizes a given JCas object's document text using the chinese tokenization
	 * script and adds the recognized tokens to the JCas object.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void tokenizeChinese(JCas jcas) {
		try {
			// read tokenized text to add tokens to the jcas
			Process proc = ttprops.getChineseTokenizationProcess();
			Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath);
			BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
			BufferedWriter out = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"));

			Integer tokenOffset = 0;
			// feed the document line by line to the tokenizer process and read its output back
			String[] inSplits = jcas.getDocumentText().split("[\\r\\n]+");
			for (String inSplit : inSplits) {
				out.write(inSplit);
				out.newLine();
				out.flush();

				// do one initial read
				String s = in.readLine();
				do {
					// break out of the loop if we've read a null
					if (s == null)
						break;

					String[] outSplits = s.split("\\s+");
					for (String tok : outSplits) {
						if (jcas.getDocumentText().indexOf(tok, tokenOffset) < 0)
							throw new RuntimeException("Could not find token " + tok
									+ " in JCas after tokenizing with Chinese tokenization script.");

						// create tokens and add them to the jcas's indexes.
						Token newToken = new Token(jcas);
						newToken.setBegin(jcas.getDocumentText().indexOf(tok, tokenOffset));
						newToken.setEnd(newToken.getBegin() + tok.length());
						newToken.addToIndexes();
						tokenOffset = newToken.getEnd();
					}

					// break out of the loop if the next read will block
					if (!in.ready())
						break;

					s = in.readLine();
				} while (true);
			}

			// clean up; FIX: also close the stdin-side writer (was leaked before)
			out.close();
			in.close();
			proc.destroy();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * based on tokens from the jcas object, adds part of speech (POS) and sentence
	 * tags to the jcas object using the treetagger program.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void doTreeTag(JCas jcas) {
		try {
			// lazily start the long-running treetagger process, reused across documents
			if (ttProc == null) {
				ttProc = new TreeTaggerProcess(ttprops.getTreeTaggingProcess());
			}

			Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);

			// collect the previously annotated tokens in index order
			AnnotationIndex ai = jcas.getAnnotationIndex(Token.type);
			List<String> tokenStrings = new ArrayList<>();
			List<Token> tokens = new ArrayList<>();
			for (FSIterator fsi = ai.iterator(); fsi.hasNext();) {
				Token token = (Token) fsi.next();
				tokenStrings.add(token.getCoveredText());
				tokens.add(token);
			}

			// pump tokens into the process and read tags back on separate threads
			// to avoid pipe-buffer deadlock
			ttreader = new TreeTaggerReader(tokens, ttProc.getStdout(), jcas, annotate_sentences);
			ttwriter = new TreeTaggerWriter(tokenStrings, ttProc.getStdin());

			Thread rThread = new Thread(ttreader);
			Thread wThread = new Thread(ttwriter);
			rThread.start();
			wThread.start();
			rThread.join();
			wThread.join();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			// FIX: restore the interrupt status instead of silently swallowing it
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}
	}

	/**
	 * based on tokens from the jcas object, adds part of speech (POS) and sentence
	 * tags to the jcas object using the treetagger program.
	 * Legacy one-shot (temp-file based) implementation, kept for reference.
	 * @param jcas JCas object supplied by the pipeline
	 */
	@SuppressWarnings({"unused"})
	private void doTreeTagOld(JCas jcas) {
		File tmpDocument = null;
		BufferedWriter tmpFileWriter;
		ArrayList<Token> tokens = new ArrayList<Token>();

		try {
			// create a temporary file and write our pre-existing tokens to it.
			tmpDocument = File.createTempFile("postokens", null);
			tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));

			// iterate over existing tokens; zero-width tokens (EMPTYLINE markers) are
			// remembered but not written to the file
			FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
			while (ai.hasNext()) {
				Token t = (Token) ai.next();
				tokens.add(t);
				if (!(t.getBegin() == t.getEnd())) {
					tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
				}
			}
			tmpFileWriter.close();
		} catch (IOException e) {
			Logger.printError("Something went wrong creating a temporary file for the treetagger to process.");
			System.exit(-1);
		}

		// Possible End-of-Sentence Tags
		HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
		hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
		hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
		hsEndOfSentenceTag.add("FS"); // SPANISH
		hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
		hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
		hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
		hsEndOfSentenceTag.add("ew"); // CHINESE

		try {
			Process p = ttprops.getTreeTaggingProcess(tmpDocument);
			Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);

			BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
			Sentence sentence = null;
			// iterate over all the output lines and tokens array (which have the same
			// source and are hence symmetric)
			int i = 0;
			String s = null;
			while ((s = in.readLine()) != null) {
				// grab a token
				Token token = tokens.get(i++);

				// modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file
				while (token.getCoveredText().equals("")) {
					// if part of the configuration, also add sentences to the jcas document
					if ((annotate_sentences) && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) {
						// Establish sentence structure
						if (sentence == null) {
							sentence = new Sentence(jcas);
							sentence.setBegin(token.getBegin());
						}

						// Finish current sentence if end-of-sentence pos was found or document ended
						sentence.setEnd(token.getEnd());
						if (sentence.getBegin() < sentence.getEnd()) {
							sentence.addToIndexes();
						}

						// Make sure current sentence is not active anymore so that a new one might be created
						sentence = null;
					}
					token.removeFromIndexes();
					token = tokens.get(i++);
				}

				// remove tokens, otherwise they are in the index twice
				token.removeFromIndexes();

				// set part of speech tag and add to indexes again
				if (!(token.getCoveredText().equals(""))) {
					token.setPos(s);
					token.addToIndexes();
				}

				// if part of the configuration, also add sentences to the jcas document
				if (annotate_sentences) {
					// Establish sentence structure
					if (sentence == null) {
						sentence = new Sentence(jcas);
						sentence.setBegin(token.getBegin());
					}

					// Finish current sentence if end-of-sentence pos was found or document ended
					if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
						sentence.setEnd(token.getEnd());
						sentence.addToIndexes();

						// Make sure current sentence is not active anymore so that a new one might be created
						sentence = null;
					}
				}
			}

			// drain leftover tokens that got no tagger output line.
			// NOTE(review): a non-null open sentence is re-added on every loop pass here;
			// looks like `sentence = null` after addToIndexes() was intended — preserved as-is.
			while (i < tokens.size()) {
				if (!(sentence == null)) {
					sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
					sentence.addToIndexes();
				}
				Token token = tokens.get(i++);
				if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
					token.removeFromIndexes();
				}
			}

			in.close();
			p.destroy();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Delete temporary files
			tmpDocument.delete();
		}
	}

	/**
	 * Sets the TreeTagger installation directory used to locate scripts and parameter files.
	 * @param home path to the TreeTagger folder
	 */
	public void setHome(String home) {
		this.ttprops.rootPath = home;
	}

	/**
	 * Merges sentences that were wrongly split by the TreeTagger around French
	 * era abbreviations (e.g. "av. J.-C."). Repeats until no more merges occur.
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void improveFrenchSentences(JCas jcas) {
		HashSet<de.unihd.dbs.uima.types.heideltime.Sentence> hsRemoveAnnotations = new HashSet<de.unihd.dbs.uima.types.heideltime.Sentence>();
		HashSet<de.unihd.dbs.uima.types.heideltime.Sentence> hsAddAnnotations = new HashSet<de.unihd.dbs.uima.types.heideltime.Sentence>();

		// second-sentence prefixes that indicate a wrongful split after "av."/"apr." etc.
		HashSet<String> hsSentenceBeginnings = new HashSet<String>();
		hsSentenceBeginnings.add("J.-C.");
		hsSentenceBeginnings.add("J-C.");
		hsSentenceBeginnings.add("NSJC");

		Boolean changes = true;
		while (changes) {
			changes = false;

			FSIndex annoHeidelSentences = jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Sentence.type);
			FSIterator iterHeidelSent = annoHeidelSentences.iterator();
			while (iterHeidelSent.hasNext()) {
				de.unihd.dbs.uima.types.heideltime.Sentence s1 =
						(de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next();
				if ((s1.getCoveredText().endsWith("av."))
						|| (s1.getCoveredText().endsWith("Av."))
						|| (s1.getCoveredText().endsWith("apr."))
						|| (s1.getCoveredText().endsWith("Apr."))
						|| (s1.getCoveredText().endsWith("avant."))
						|| (s1.getCoveredText().endsWith("Avant."))) {
					if (iterHeidelSent.hasNext()) {
						// peek at the following sentence, then step back so outer iteration continues correctly
						de.unihd.dbs.uima.types.heideltime.Sentence s2 =
								(de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next();
						iterHeidelSent.moveToPrevious();
						for (String beg : hsSentenceBeginnings) {
							if (s2.getCoveredText().startsWith(beg)) {
								// replace the two fragments with one merged sentence
								de.unihd.dbs.uima.types.heideltime.Sentence s3 =
										new de.unihd.dbs.uima.types.heideltime.Sentence(jcas);
								s3.setBegin(s1.getBegin());
								s3.setEnd(s2.getEnd());
								hsAddAnnotations.add(s3);
								hsRemoveAnnotations.add(s1);
								hsRemoveAnnotations.add(s2);
								changes = true;
								break;
							}
						}
					}
				}
			}

			// apply the collected changes outside of the iteration
			for (de.unihd.dbs.uima.types.heideltime.Sentence s : hsRemoveAnnotations) {
				s.removeFromIndexes(jcas);
			}
			hsRemoveAnnotations.clear();
			for (de.unihd.dbs.uima.types.heideltime.Sentence s : hsAddAnnotations) {
				s.addToIndexes(jcas);
			}
			hsAddAnnotations.clear();
		}
	}

	/**
	 * improve german sentences; the treetagger splits german sentences incorrectly on some occasions
	 * @param jcas JCas object supplied by the pipeline
	 */
	private void improveGermanSentences(JCas jcas) {
		/*
		 * these POS tag sequences will decide whether we want to merge two sentences
		 * that have (supposedly wrongfully) been split.
		 */
		HashSet<String[]> posRules = new HashSet<String[]>();
		posRules.add(new String[] {"CARD", "\\$.", "NN"});
		posRules.add(new String[] {"CARD", "\\$.", "NE"});

		FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator();

		// compare two sentences at a time in order to have access to all POS tags
		HashSet<HashSet<Sentence>> toMerge = new HashSet<HashSet<Sentence>>();
		Sentence prevSent = null, thisSent = null;
		while (sentIter.hasNext()) {
			if (thisSent == null) {
				thisSent = (Sentence) sentIter.next();
				continue;
			}

			prevSent = thisSent;
			thisSent = (Sentence) sentIter.next();

			/*
			 * select the last two tokens within the previous sentence as well as the
			 * first of the current one and check for matches.
			 */
			Token penultimateToken = null, ultimateToken = null, firstToken = null;
			FSIterator tokIter = jcas.getAnnotationIndex(Token.type).subiterator(thisSent);
			if (tokIter.hasNext()) {
				firstToken = (Token) tokIter.next();
			}
			tokIter = jcas.getAnnotationIndex(Token.type).subiterator(prevSent);
			while (tokIter.hasNext()) {
				if (ultimateToken == null) {
					ultimateToken = (Token) tokIter.next();
					continue;
				}
				penultimateToken = ultimateToken;
				ultimateToken = (Token) tokIter.next();
			}

			// check that all tokens for further analysis are present. if not: skip
			if (penultimateToken == null || ultimateToken == null || firstToken == null) {
				continue;
			}

			// check rules, memorize sentences to be merged
			for (String[] posRule : posRules) {
				/*
				 * either one of the pre-defined POS rules fit, or the first token's
				 * covered text begins with lower case characters.
				 */
				if ((penultimateToken.getPos() != null && penultimateToken.getPos().matches(posRule[0])
						&& ultimateToken.getPos() != null && ultimateToken.getPos().matches(posRule[1])
						&& firstToken.getPos() != null && firstToken.getPos().matches(posRule[2]))
						|| (firstToken.getCoveredText().matches("^[a-z/].*"))) {
					/*
					 * check whether one of the previous candidate pairs already
					 * contains one of our sentences.
					 */
					Boolean candidateExisted = false;
					for (HashSet<Sentence> mergeCandidate : toMerge) {
						if (mergeCandidate.contains(thisSent) || mergeCandidate.contains(prevSent)) {
							// we add both here because sets ignore duplicates
							mergeCandidate.add(prevSent);
							mergeCandidate.add(thisSent);

							candidateExisted = true;
							break;
						}
					}

					/*
					 * if one of the sentences was not already to be merged with another,
					 * create a new merge candidate set
					 */
					if (!candidateExisted) {
						HashSet<Sentence> newCandidate = new HashSet<Sentence>();
						newCandidate.add(prevSent);
						newCandidate.add(thisSent);
						toMerge.add(newCandidate);
					}
					break; // don't need to do the next rules; already merging.
				}
			}
		}

		// iterate over the previously collected merge candidates
		for (HashSet<Sentence> mergeCandidate : toMerge) {
			// find the earliest beginning and latest end for the set of sentences
			Integer beginIndex = Integer.MAX_VALUE, endIndex = Integer.MIN_VALUE;
			Sentence mergedSent = new Sentence(jcas);
			for (Sentence s : mergeCandidate) {
				if (s.getBegin() < beginIndex) {
					beginIndex = s.getBegin();
				}
				if (s.getEnd() > endIndex) {
					endIndex = s.getEnd();
				}
				s.removeFromIndexes();
			}

			// set values, add to jcas
			mergedSent.setBegin(beginIndex);
			mergedSent.setEnd(endIndex);
			mergedSent.addToIndexes();
		}
	}

	/**
	 * Shuts down the reusable treetagger process, if one was started.
	 */
	public void quit() {
		// FIX: guard against quit() before doTreeTag() ever ran (or double quit)
		if (ttProc != null) {
			ttProc.close();
			ttProc = null;
		}
	}
}