/*
 * AnnotationTranslator.java
 *
 * Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU General Public License.
 *
 * author: Jannik Strötgen
 * email:  stroetgen@uni-hd.de
 *
 * Annotation Translator translates annotations of one type system into another.
 * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
 */

package de.unihd.dbs.uima.annotator.annotationtranslator;

import java.util.HashMap;
import java.util.HashSet;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

/**
 * Adds an additional annotation type for an existing annotation.
 *
 * <p>Depending on its three boolean configuration parameters, the annotator
 * <ul>
 *   <li>creates DKPro sentences and tokens from the HeidelTime sentences and tokens
 *       ({@link #PARAM_HEIDELTIME_TO_DKPRO}),</li>
 *   <li>creates (or re-uses) HeidelTime sentences and tokens, including POS values, from the
 *       DKPro sentences, tokens and POS annotations and then removes the DKPro sentences and
 *       tokens from the indexes ({@link #PARAM_DKPRO_TO_HEIDELTIME}),</li>
 *   <li>merges HeidelTime sentences that were split after a number followed by a full stop
 *       when the following sentence starts with one of {@link #hsSentenceBeginnings}
 *       ({@link #PARAM_IMPROVE_SENTENCE_DE}).</li>
 * </ul>
 *
 * @author jannik
 */
public class AnnotationTranslator extends JCasAnnotator_ImplBase {

    @SuppressWarnings("unused")
    private String toolname = "de.unihd.dbs.uima.annotator.annotationtranslator";

    /** Name of the configuration parameter that switches on the DKPro to HeidelTime translation. */
    public static final String PARAM_DKPRO_TO_HEIDELTIME = "DkproToHeideltime";
    /** Name of the configuration parameter that switches on the HeidelTime to DKPro translation. */
    public static final String PARAM_HEIDELTIME_TO_DKPRO = "HeideltimeToDkpro";
    /** Name of the configuration parameter that switches on the sentence merging step. */
    public static final String PARAM_IMPROVE_SENTENCE_DE = "ImproveGermanSentences";

    /** Words that, at the start of a sentence, indicate that the previous split was wrong. */
    private static final String[] GERMAN_SENTENCE_BEGINNINGS = {
        "Januar", "Februar", "März", "April", "Mai", "Juni",
        "Juli", "August", "September", "Oktober", "November", "Dezember",
        "Jahrhundert", "Jahr", "Monat", "Woche"
    };

    /** Matches a text tail that ends in digits, a full stop and optional trailing whitespace. */
    private static final java.util.regex.Pattern NUMBER_DOT_TAIL =
            java.util.regex.Pattern.compile(".*[\\d]+\\.[\\s\\n]*$");

    /** Whether DKPro annotations are translated to HeidelTime annotations. */
    public Boolean dkproToHeidel = true;
    /** Whether HeidelTime annotations are translated to DKPro annotations. */
    public Boolean heidelToDpkro = true;
    /** Whether wrongly split sentences are merged. */
    public Boolean improveSentDe = true;

    /** Sentence beginnings that trigger a merge with the preceding sentence; filled in {@link #initialize}. */
    public HashSet<String> hsSentenceBeginnings;

    /**
     * Reads the three boolean switches from the context and fills {@link #hsSentenceBeginnings}.
     * A parameter without a value keeps the current field value instead of overwriting it with
     * {@code null}.
     *
     * @param aContext the UIMA context providing the configuration parameters
     * @throws ResourceInitializationException if the superclass initialization fails
     * @see org.apache.uima.analysis_component.AnalysisComponent#initialize(UimaContext)
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        heidelToDpkro = readFlag(aContext, PARAM_HEIDELTIME_TO_DKPRO, heidelToDpkro);
        dkproToHeidel = readFlag(aContext, PARAM_DKPRO_TO_HEIDELTIME, dkproToHeidel);
        improveSentDe = readFlag(aContext, PARAM_IMPROVE_SENTENCE_DE, improveSentDe);

        hsSentenceBeginnings = new HashSet<String>();
        for (String beginning : GERMAN_SENTENCE_BEGINNINGS) {
            hsSentenceBeginnings.add(beginning);
        }
    }

    /**
     * Runs the enabled translation steps in a fixed order: HeidelTime to DKPro, DKPro to
     * HeidelTime, sentence merging. A switch that is {@code null} is treated as disabled.
     *
     * @param jcas the CAS whose annotations are translated
     * @see JCasAnnotator_ImplBase#process(JCas)
     */
    @Override
    public void process(JCas jcas) {
        if (Boolean.TRUE.equals(heidelToDpkro)) {
            translateHeideltimeToDkpro(jcas);
        }
        if (Boolean.TRUE.equals(dkproToHeidel)) {
            translateDkproToHeideltime(jcas);
        }
        if (Boolean.TRUE.equals(improveSentDe)) {
            mergeWronglySplitSentences(jcas);
        }
    }

    /** Returns the configured value of a boolean parameter, or {@code fallback} if it has no value. */
    private static Boolean readFlag(UimaContext context, String name, Boolean fallback) {
        Object value = context.getConfigParameterValue(name);
        if (value == null) {
            return fallback;
        }
        // a value of the wrong type still fails with a ClassCastException
        return (Boolean) value;
    }

    /** Builds the key under which an annotation span is stored in the lookup maps. */
    private static String spanKey(int begin, int end) {
        return begin + "-" + end;
    }

    /** Adds one DKPro sentence/token for every HeidelTime sentence/token; nothing is removed. */
    private void translateHeideltimeToDkpro(JCas jcas) {
        FSIterator iterSentHeidel =
                jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Sentence.type).iterator();
        FSIterator iterTokHeidel =
                jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Token.type).iterator();

        // create DKPro sentences from HeidelTime sentences
        while (iterSentHeidel.hasNext()) {
            de.unihd.dbs.uima.types.heideltime.Sentence source =
                    (de.unihd.dbs.uima.types.heideltime.Sentence) iterSentHeidel.next();
            de.tudarmstadt.ukp.dkpro.core.type.Sentence target =
                    new de.tudarmstadt.ukp.dkpro.core.type.Sentence(jcas);
            target.setBegin(source.getBegin());
            target.setEnd(source.getEnd());
            target.addToIndexes();
        }

        // create DKPro tokens from HeidelTime tokens
        while (iterTokHeidel.hasNext()) {
            de.unihd.dbs.uima.types.heideltime.Token source =
                    (de.unihd.dbs.uima.types.heideltime.Token) iterTokHeidel.next();
            de.tudarmstadt.ukp.dkpro.core.type.Token target =
                    new de.tudarmstadt.ukp.dkpro.core.type.Token(jcas);
            target.setBegin(source.getBegin());
            target.setEnd(source.getEnd());
            target.addToIndexes();
        }
    }

    /**
     * Creates HeidelTime sentences and tokens (with POS values) from the DKPro sentences, the
     * DKPro tokens inside them and the DKPro POS annotations; HeidelTime annotations that already
     * exist with the same span are re-used. Afterwards the translated DKPro sentences and tokens
     * are removed from the indexes.
     */
    private void translateDkproToHeideltime(JCas jcas) {
        FSIterator iterSentDkpro =
                jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.Sentence.type).iterator();

        // get all the HeidelTime sentences and tokens that are already available
        FSIterator iterSentHeidel =
                jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Sentence.type).iterator();
        FSIterator iterTokHeidel =
                jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Token.type).iterator();
        HashMap<String, de.unihd.dbs.uima.types.heideltime.Sentence> hmOldSent =
                new HashMap<String, de.unihd.dbs.uima.types.heideltime.Sentence>();
        HashMap<String, de.unihd.dbs.uima.types.heideltime.Token> hmOldTok =
                new HashMap<String, de.unihd.dbs.uima.types.heideltime.Token>();
        while (iterSentHeidel.hasNext()) {
            de.unihd.dbs.uima.types.heideltime.Sentence s =
                    (de.unihd.dbs.uima.types.heideltime.Sentence) iterSentHeidel.next();
            hmOldSent.put(spanKey(s.getBegin(), s.getEnd()), s);
        }
        while (iterTokHeidel.hasNext()) {
            de.unihd.dbs.uima.types.heideltime.Token t =
                    (de.unihd.dbs.uima.types.heideltime.Token) iterTokHeidel.next();
            hmOldTok.put(spanKey(t.getBegin(), t.getEnd()), t);
        }

        // DKPro annotations are only removed after the iteration over them has finished
        HashSet<de.tudarmstadt.ukp.dkpro.core.type.Sentence> hsRemoveDkproSent =
                new HashSet<de.tudarmstadt.ukp.dkpro.core.type.Sentence>();
        HashSet<de.tudarmstadt.ukp.dkpro.core.type.Token> hsRemoveDkproTok =
                new HashSet<de.tudarmstadt.ukp.dkpro.core.type.Token>();

        while (iterSentDkpro.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.Sentence s1 =
                    (de.tudarmstadt.ukp.dkpro.core.type.Sentence) iterSentDkpro.next();

            // re-use an old HeidelTime sentence with the same span; only allocate otherwise
            de.unihd.dbs.uima.types.heideltime.Sentence s2 =
                    hmOldSent.get(spanKey(s1.getBegin(), s1.getEnd()));
            if (s2 == null) {
                s2 = new de.unihd.dbs.uima.types.heideltime.Sentence(jcas);
            } else {
                s2.removeFromIndexes();
            }
            s2.setBegin(s1.getBegin());
            s2.setEnd(s1.getEnd());
            s2.addToIndexes();
            hsRemoveDkproSent.add(s1);

            // the POS annotations do not change while the tokens are created, so read them once
            HashMap<String, String> posBySpan = collectPosValues(jcas, s1);

            // create HeidelTime tokens (with POS information) from DKPro tokens
            FSIterator iterTokDkpro =
                    jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.Token.type).subiterator(s1);
            while (iterTokDkpro.hasNext()) {
                de.tudarmstadt.ukp.dkpro.core.type.Token t1 =
                        (de.tudarmstadt.ukp.dkpro.core.type.Token) iterTokDkpro.next();
                String tokenKey = spanKey(t1.getBegin(), t1.getEnd());

                de.unihd.dbs.uima.types.heideltime.Token t2 = hmOldTok.get(tokenKey);
                if (t2 == null) {
                    t2 = new de.unihd.dbs.uima.types.heideltime.Token(jcas);
                } else {
                    t2.removeFromIndexes();
                }
                t2.setBegin(t1.getBegin());
                t2.setEnd(t1.getEnd());
                // a token without a POS annotation of identical span keeps whatever pos it had
                if (posBySpan.containsKey(tokenKey)) {
                    t2.setPos(posBySpan.get(tokenKey));
                }
                t2.addToIndexes();
                hsRemoveDkproTok.add(t1);
            }
        }

        // remove DKPro sentences, finally
        for (de.tudarmstadt.ukp.dkpro.core.type.Sentence s : hsRemoveDkproSent) {
            s.removeFromIndexes();
        }
        // remove DKPro tokens, finally
        for (de.tudarmstadt.ukp.dkpro.core.type.Token t : hsRemoveDkproTok) {
            t.removeFromIndexes();
        }
    }

    /**
     * Maps the span of every DKPro POS annotation inside {@code sentence} to its value. The POS
     * types are visited in a fixed order and later entries overwrite earlier ones, so for a span
     * carrying several POS annotations the value of the last one visited is kept.
     */
    private HashMap<String, String> collectPosValues(JCas jcas,
            de.tudarmstadt.ukp.dkpro.core.type.Sentence sentence) {
        HashMap<String, String> posBySpan = new HashMap<String, String>();
        FSIterator it;

        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.ADJ.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.ADJ adj = (de.tudarmstadt.ukp.dkpro.core.type.pos.ADJ) it.next();
            posBySpan.put(spanKey(adj.getBegin(), adj.getEnd()), adj.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.ADV.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.ADV adv = (de.tudarmstadt.ukp.dkpro.core.type.pos.ADV) it.next();
            posBySpan.put(spanKey(adv.getBegin(), adv.getEnd()), adv.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.ART.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.ART art = (de.tudarmstadt.ukp.dkpro.core.type.pos.ART) it.next();
            posBySpan.put(spanKey(art.getBegin(), art.getEnd()), art.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.CARD.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.CARD card = (de.tudarmstadt.ukp.dkpro.core.type.pos.CARD) it.next();
            posBySpan.put(spanKey(card.getBegin(), card.getEnd()), card.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.CONJ.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.CONJ conj = (de.tudarmstadt.ukp.dkpro.core.type.pos.CONJ) it.next();
            posBySpan.put(spanKey(conj.getBegin(), conj.getEnd()), conj.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.NN.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.NN nn = (de.tudarmstadt.ukp.dkpro.core.type.pos.NN) it.next();
            posBySpan.put(spanKey(nn.getBegin(), nn.getEnd()), nn.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.NP.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.NP np = (de.tudarmstadt.ukp.dkpro.core.type.pos.NP) it.next();
            posBySpan.put(spanKey(np.getBegin(), np.getEnd()), np.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.O.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.O o = (de.tudarmstadt.ukp.dkpro.core.type.pos.O) it.next();
            posBySpan.put(spanKey(o.getBegin(), o.getEnd()), o.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.PP.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.PP pp = (de.tudarmstadt.ukp.dkpro.core.type.pos.PP) it.next();
            posBySpan.put(spanKey(pp.getBegin(), pp.getEnd()), pp.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.PR.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.PR pr = (de.tudarmstadt.ukp.dkpro.core.type.pos.PR) it.next();
            posBySpan.put(spanKey(pr.getBegin(), pr.getEnd()), pr.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.PUNC.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.PUNC punc = (de.tudarmstadt.ukp.dkpro.core.type.pos.PUNC) it.next();
            posBySpan.put(spanKey(punc.getBegin(), punc.getEnd()), punc.getValue());
        }
        it = jcas.getAnnotationIndex(de.tudarmstadt.ukp.dkpro.core.type.pos.V.type).subiterator(sentence);
        while (it.hasNext()) {
            de.tudarmstadt.ukp.dkpro.core.type.pos.V v = (de.tudarmstadt.ukp.dkpro.core.type.pos.V) it.next();
            posBySpan.put(spanKey(v.getBegin(), v.getEnd()), v.getValue());
        }
        return posBySpan;
    }

    /**
     * Improves the sentence boundaries: whenever a HeidelTime sentence ends in a number followed
     * by a full stop and the next sentence starts with one of {@link #hsSentenceBeginnings}, both
     * are replaced by one sentence spanning them. The pass is repeated until nothing changes.
     */
    private void mergeWronglySplitSentences(JCas jcas) {
        HashSet<de.unihd.dbs.uima.types.heideltime.Sentence> hsRemoveAnnotations =
                new HashSet<de.unihd.dbs.uima.types.heideltime.Sentence>();
        HashSet<de.unihd.dbs.uima.types.heideltime.Sentence> hsAddAnnotations =
                new HashSet<de.unihd.dbs.uima.types.heideltime.Sentence>();

        boolean changes = true;
        while (changes) {
            changes = false;
            FSIndex annoHeidelSentences =
                    jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Sentence.type);
            FSIterator iterHeidelSent = annoHeidelSentences.iterator();
            while (iterHeidelSent.hasNext()) {
                de.unihd.dbs.uima.types.heideltime.Sentence s1 =
                        (de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next();
                if (!endsWithNumberAndDot(s1.getCoveredText()) || !iterHeidelSent.hasNext()) {
                    continue;
                }
                // look at the following sentence, then step back so that it is visited again as
                // the first sentence of the next pair
                de.unihd.dbs.uima.types.heideltime.Sentence s2 =
                        (de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next();
                iterHeidelSent.moveToPrevious();
                if (startsWithSentenceBeginning(s2.getCoveredText())) {
                    de.unihd.dbs.uima.types.heideltime.Sentence merged =
                            new de.unihd.dbs.uima.types.heideltime.Sentence(jcas);
                    merged.setBegin(s1.getBegin());
                    merged.setEnd(s2.getEnd());
                    hsAddAnnotations.add(merged);
                    hsRemoveAnnotations.add(s1);
                    hsRemoveAnnotations.add(s2);
                    changes = true;
                }
            }

            // the indexes are only modified once the iteration over them has finished
            for (de.unihd.dbs.uima.types.heideltime.Sentence s : hsRemoveAnnotations) {
                s.removeFromIndexes(jcas);
            }
            hsRemoveAnnotations.clear();
            for (de.unihd.dbs.uima.types.heideltime.Sentence s : hsAddAnnotations) {
                s.addToIndexes(jcas);
            }
            hsAddAnnotations.clear();
        }
    }

    /**
     * Tells whether the tail of {@code text} ends in digits, a full stop and optional whitespace.
     * The tail is the last four characters, but never includes the first character of the text.
     * A {@code null} or empty text never matches.
     */
    private static boolean endsWithNumberAndDot(String text) {
        if (text == null) {
            return false;
        }
        // clamp to the text length so that an empty text yields an empty tail, not an exception
        int offset = Math.min(Math.max(text.length() - 4, 1), text.length());
        return NUMBER_DOT_TAIL.matcher(text.substring(offset)).matches();
    }

    /** Tells whether {@code text} starts with one of the entries of {@link #hsSentenceBeginnings}. */
    private boolean startsWithSentenceBeginning(String text) {
        if (text == null) {
            return false;
        }
        for (String beginning : hsSentenceBeginnings) {
            if (text.startsWith(beginning)) {
                return true;
            }
        }
        return false;
    }
}