package de.unihd.dbs.uima.annotator.intervaltagger; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Calendar; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.regex.MatchResult; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; import de.unihd.dbs.uima.annotator.heideltime.resources.RePatternManager; import de.unihd.dbs.uima.annotator.heideltime.resources.ResourceMap; import de.unihd.dbs.uima.annotator.heideltime.resources.ResourceScanner; import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox; import de.unihd.dbs.uima.types.heideltime.IntervalCandidateSentence; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Timex3; import de.unihd.dbs.uima.types.heideltime.Timex3Interval; /** * IntervalTagger is a UIMA annotator that discovers and tags intervals in documents. * @author Manuel Dewald, Julian Zell * */ public class IntervalTagger extends JCasAnnotator_ImplBase { // TOOL NAME (may be used as componentId) private Class<?> component = this.getClass(); // descriptor parameter names public static String PARAM_LANGUAGE = "language"; public static String PARAM_INTERVALS = "annotate_intervals"; public static String PARAM_INTERVAL_CANDIDATES = "annotate_interval_candidates"; // descriptor configuration private Language language = null; private Boolean find_intervals = true; private Boolean find_interval_candidates = true; private HashMap<Pattern, String> hmIntervalPattern = new HashMap<Pattern, String>(); private HashMap<String, String> hmIntervalNormalization = new HashMap<String, String>(); /** * initialization: read configuration parameters and resources */ public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE)); find_intervals = (Boolean) aContext.getConfigParameterValue(PARAM_INTERVALS); find_interval_candidates = (Boolean) aContext.getConfigParameterValue(PARAM_INTERVAL_CANDIDATES); ResourceScanner rs = ResourceScanner.getInstance(); readResources(rs.getRules(language.getName())); } /** * called by the pipeline to process the document */ public void process(JCas jcas) throws AnalysisEngineProcessException { if(find_intervals) { findIntervals(jcas); findSentenceIntervals(jcas); } } /** * reads in heideltime's resource files. * @throws ResourceInitializationException */ private void readResources(ResourceMap hmResourcesRules) throws ResourceInitializationException { Pattern paReadRules = Pattern.compile("RULENAME=\"(.*?)\",EXTRACTION=\"(.*?)\",NORM_VALUE=\"(.*?)\"(.*)"); Pattern paVariable = Pattern.compile("%(re[a-zA-Z0-9]*)"); // read normalization data InputStream is = null; InputStreamReader isr = null; BufferedReader br = null; try { for (String resource : hmResourcesRules.keySet()) { is = hmResourcesRules.getInputStream(resource); isr = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(isr); Logger.printDetail(component, "Adding rule resource: " + resource); for(String line; (line = br.readLine()) != null; ) { if(line.startsWith("//") || line.equals("")) { continue; } Logger.printDetail("DEBUGGING: reading rules..."+ line); // check each line for the name, extraction, and normalization part for (MatchResult r : Toolbox.findMatches(paReadRules, line)) { String rule_name = r.group(1); String rule_extraction = r.group(2); String rule_normalization = r.group(3); //////////////////////////////////////////////////////////////////// // RULE EXTRACTION PARTS ARE TRANSLATED INTO REGULAR EXPRESSSIONS // //////////////////////////////////////////////////////////////////// // create pattern for rule extraction part RePatternManager rpm = RePatternManager.getInstance(language, false); for (MatchResult mr : Toolbox.findMatches(paVariable,rule_extraction)) { Logger.printDetail("DEBUGGING: replacing patterns..."+ mr.group()); if (!(rpm.containsKey(mr.group(1)))) { Logger.printError("Error creating rule:"+rule_name); Logger.printError("The following pattern used in this rule does not exist, does it? %"+mr.group(1)); System.exit(-1); } rule_extraction = rule_extraction.replaceAll("%"+mr.group(1), rpm.get(mr.group(1))); } rule_extraction = rule_extraction.replaceAll(" ", "[\\\\s]+"); Pattern pattern = null; try{ pattern = Pattern.compile(rule_extraction); } catch (java.util.regex.PatternSyntaxException e) { Logger.printError("Compiling rules resulted in errors."); Logger.printError("Problematic rule is "+rule_name); Logger.printError("Cannot compile pattern: "+rule_extraction); e.printStackTrace(); System.exit(-1); } ///////////////////////////////////////////////// // READ INTERVAL RULES AND MAKE THEM AVAILABLE // ///////////////////////////////////////////////// if(resource.equals("intervalrules")){ hmIntervalPattern.put(pattern,rule_name); hmIntervalNormalization.put(rule_name, rule_normalization); } } } } } catch (IOException e) { e.printStackTrace(); throw new ResourceInitializationException(); } finally { try { if(br != null) { br.close(); } if(isr != null) { isr.close(); } if(is != null) { is.close(); } } catch(Exception e) { e.printStackTrace(); } } } /** * Extract Timex3Intervals, delimited by two Timex3Intervals in a sentence. * finsInterval needs to be run with jcas before. * @param jcas * @author Manuel Dewald */ private void findSentenceIntervals(JCas jcas){ HashSet<Timex3Interval> timexesToRemove = new HashSet<Timex3Interval>(); FSIterator iterSentence = jcas.getAnnotationIndex(Sentence.type).iterator(); while (iterSentence.hasNext()) { Sentence s=(Sentence)iterSentence.next(); String sString=s.getCoveredText(); FSIterator iterInter = jcas.getAnnotationIndex(Timex3Interval.type).subiterator(s); int count=0; List<Timex3Interval> txes=new ArrayList<Timex3Interval>(); List<Timex3Interval> sentenceTxes=new ArrayList<Timex3Interval>(); while(iterInter.hasNext()){ Timex3Interval t=(Timex3Interval)iterInter.next(); sString=sString.replace(t.getCoveredText(), "<TX3_"+count+">"); count++; txes.add(t); } if(count>0){ if (find_interval_candidates){ IntervalCandidateSentence sI=new IntervalCandidateSentence(jcas); sI.setBegin(s.getBegin()); sI.setEnd(s.getEnd()); sI.addToIndexes(); } for(Pattern p: hmIntervalPattern.keySet()){ String name=hmIntervalPattern.get(p); List<MatchResult>results=(List<MatchResult>)Toolbox.findMatches(p,sString); if(results.size()>0){ //Interval in Sentence s found by Pattern p! for(MatchResult r: results){ Pattern pNorm=Pattern.compile("group\\(([1-9]+)\\)-group\\(([1-9]+)\\)"); String norm=hmIntervalNormalization.get(name); Matcher mNorm=pNorm.matcher(norm); if(!mNorm.matches()){ System.err.println("Problem with the Norm in rule "+name); } Timex3Interval startTx=null,endTx=null; try{ int startId=Integer.parseInt(mNorm.group(1)); int endId=Integer.parseInt(mNorm.group(2)); startTx=txes.get(Integer.parseInt(r.group(startId))); endTx=txes.get(Integer.parseInt(r.group(endId))); }catch(Exception e){ e.printStackTrace(); return; } Timex3Interval annotation=new Timex3Interval(jcas); annotation.setBegin(startTx.getBegin()>endTx.getBegin()?endTx.getBegin():startTx.getBegin()); annotation.setEnd(startTx.getEnd()>endTx.getEnd()?startTx.getEnd():endTx.getEnd()); //Does the interval already exist, //found by another pattern? boolean duplicate=false; for(Timex3Interval tx:sentenceTxes){ if(tx.getBegin()==annotation.getBegin() && tx.getEnd()==annotation.getEnd()){ duplicate=true; break; } } if(!duplicate){ annotation.setTimexValueEB(startTx.getTimexValueEB()); annotation.setTimexValueLB(startTx.getTimexValueLE()); annotation.setTimexValueEE(endTx.getTimexValueEB()); annotation.setTimexValueLE(endTx.getTimexValueLE()); annotation.setTimexType(startTx.getTimexType()); annotation.setFoundByRule(name); // create emptyvalue value String emptyValue = createEmptyValue(startTx, endTx, jcas); annotation.setEmptyValue(emptyValue); annotation.setBeginTimex(startTx.getBeginTimex()); annotation.setEndTimex(endTx.getEndTimex()); try { sentenceTxes.add(annotation); } catch(NumberFormatException e) { Logger.printError(component, "Couldn't do emptyValue calculation on accont of a faulty normalization in " + annotation.getTimexValueEB() + " or " + annotation.getTimexValueEE()); } // prepare tx3intervals to remove timexesToRemove.add(startTx); timexesToRemove.add(endTx); annotation.addToIndexes(); // System.out.println(emptyValue); } } } } } } for(Timex3Interval txi : timexesToRemove) { txi.removeFromIndexes(); } } private String createEmptyValue(Timex3Interval startTx, Timex3Interval endTx, JCas jcas) throws NumberFormatException { String dateStr = "", timeStr = ""; // find granularity for start/end timex values Pattern p = Pattern.compile("(\\d{1,4})?-?(\\d{2})?-?(\\d{2})?(T)?(\\d{2})?:?(\\d{2})?:?(\\d{2})?"); // 1 2 3 4 5 6 7 Matcher mStart = p.matcher(startTx.getTimexValue()); Matcher mEnd = p.matcher(endTx.getTimexValue()); Integer granularityStart = -1; Integer granularityEnd = -2; Integer granularity = -1; // find the highest granularity in each timex if(mStart.find() && mEnd.find()) { for(Integer i = 1; i <= mStart.groupCount(); i++) { if(mStart.group(i) != null) granularityStart = i; if(mEnd.group(i) != null) granularityEnd = i; } } // if granularities aren't the same, we can't do anything here. if(granularityEnd != granularityStart) { return ""; } else { // otherwise, set maximum granularity granularity = granularityStart; } // check all the different granularities, starting with seconds, calculate differences, add carries Integer myYears = 0, myMonths = 0, myDays = 0, myHours = 0, myMinutes = 0, mySeconds = 0; if(granularity >= 7 && mStart.group(7) != null && mEnd.group(7) != null) { mySeconds = Integer.parseInt(mEnd.group(7)) - Integer.parseInt(mStart.group(7)); if(mySeconds < 0) { mySeconds += 60; myMinutes -= 1; } } if(granularity >= 6 && mStart.group(6) != null && mEnd.group(6) != null) { myMinutes += Integer.parseInt(mEnd.group(6)) - Integer.parseInt(mStart.group(6)); if(myMinutes < 0) { myMinutes += 60; myHours -= 1; } } if(granularity >= 5 && mStart.group(5) != null && mEnd.group(5) != null) { myHours += Integer.parseInt(mEnd.group(5)) - Integer.parseInt(mStart.group(5)); if(myHours < 0) { myMinutes += 24; myDays -= 1; } } if(granularity >= 3 && mStart.group(3) != null && mEnd.group(3) != null) { myDays += Integer.parseInt(mEnd.group(3)) - Integer.parseInt(mStart.group(3)); if(myDays < 0) { Calendar cal = Calendar.getInstance(); cal.set(Calendar.YEAR, Integer.parseInt(mStart.group(1))); cal.set(Calendar.MONTH, Integer.parseInt(mStart.group(2))); myMonths = myMonths - 1; myDays += cal.getActualMaximum(Calendar.DAY_OF_MONTH); } } if(granularity >= 2 && mStart.group(2) != null && mEnd.group(2) != null) { myMonths += Integer.parseInt(mEnd.group(2)) - Integer.parseInt(mStart.group(2)); if(myMonths < 0) { myMonths += Integer.parseInt(mStart.group(2)); myYears -= 1; } } String myYearUnit = ""; if(granularity >= 1 && mStart.group(1) != null && mEnd.group(1) != null) { String year1str = mStart.group(1), year2str = mEnd.group(1); // trim year strings to same length (NNNN year, NNN decade, NN century) while(year2str.length() > year1str.length()) year2str = year2str.substring(0, year2str.length()-1); while(year1str.length() > year2str.length()) year1str = year1str.substring(0, year1str.length()-1); // check for year unit switch(year1str.length()) { case 2: myYearUnit = "CE"; myYears = Integer.parseInt(year2str) - Integer.parseInt(year1str); break; case 3: myYearUnit = "DE"; myYears = Integer.parseInt(year2str) - Integer.parseInt(year1str); break; case 4: myYearUnit = "Y"; myYears += Integer.parseInt(year2str) - Integer.parseInt(year1str); break; default: break; } } // assemble strings dateStr += (myYears > 0 ? myYears + myYearUnit : ""); dateStr += (myMonths > 0 ? myMonths + "M" : ""); dateStr += (myDays > 0 ? myDays + "D" : ""); timeStr += (myHours > 0 ? myHours + "H" : ""); timeStr += (myMinutes > 0 ? myMinutes + "M" : ""); timeStr += (mySeconds > 0 ? mySeconds + "S" : ""); // output return "P" + dateStr + (timeStr.length() > 0 ? "T" + timeStr : ""); } /** * Build Timex3Interval-Annotations out of Timex3Annotations in jcas. * @author Manuel Dewald * @param jcas */ private void findIntervals(JCas jcas) { ArrayList<Timex3Interval> newAnnotations = new ArrayList<Timex3Interval>(); FSIterator iterTimex3 = jcas.getAnnotationIndex(Timex3.type).iterator(); while (iterTimex3.hasNext()) { Timex3Interval annotation=new Timex3Interval(jcas); Timex3 timex3 = (Timex3) iterTimex3.next(); //DATE Pattern Pattern pDate = Pattern.compile("(?:BC)?(\\d\\d\\d\\d)(-(\\d+))?(-(\\d+))?(T(\\d+))?(:(\\d+))?(:(\\d+))?"); Pattern pCentury = Pattern.compile("(\\d\\d)"); Pattern pDecate = Pattern.compile("(\\d\\d\\d)"); Pattern pQuarter = Pattern.compile("(\\d+)-Q([1-4])"); Pattern pHalf = Pattern.compile("(\\d+)-H([1-2])"); Pattern pSeason = Pattern.compile("(\\d+)-(SP|SU|FA|WI)"); Pattern pWeek = Pattern.compile("(\\d+)-W(\\d+)"); Pattern pWeekend = Pattern.compile("(\\d+)-W(\\d+)-WE"); Pattern pTimeOfDay = Pattern.compile("(\\d+)-(\\d+)-(\\d+)T(AF|DT|MI|MO|EV|NI)"); Matcher mDate = pDate.matcher(timex3.getTimexValue()); Matcher mCentury= pCentury.matcher(timex3.getTimexValue()); Matcher mDecade = pDecate.matcher(timex3.getTimexValue()); Matcher mQuarter= pQuarter.matcher(timex3.getTimexValue()); Matcher mHalf = pHalf.matcher(timex3.getTimexValue()); Matcher mSeason = pSeason.matcher(timex3.getTimexValue()); Matcher mWeek = pWeek.matcher(timex3.getTimexValue()); Matcher mWeekend= pWeekend.matcher(timex3.getTimexValue()); Matcher mTimeOfDay= pTimeOfDay.matcher(timex3.getTimexValue()); boolean matchesDate=mDate.matches(); boolean matchesCentury=mCentury.matches(); boolean matchesDecade=mDecade.matches(); boolean matchesQuarter=mQuarter.matches(); boolean matchesHalf=mHalf.matches(); boolean matchesSeason=mSeason.matches(); boolean matchesWeek=mWeek.matches(); boolean matchesWeekend=mWeekend.matches(); boolean matchesTimeOfDay=mTimeOfDay.matches(); String beginYear, endYear; String beginMonth, endMonth; String beginDay, endDay; String beginHour, endHour; String beginMinute, endMinute; String beginSecond, endSecond; beginYear=endYear="UNDEF"; beginMonth="01"; endMonth="12"; beginDay="01"; endDay="31"; beginHour="00"; endHour="23"; beginMinute="00"; endMinute="59"; beginSecond="00"; endSecond="59"; if(matchesDate){ //Get Year(1) beginYear=endYear=mDate.group(1); //Get Month(3) if(mDate.group(3)!=null){ beginMonth=endMonth=mDate.group(3); //Get Day(5) if(mDate.group(5)==null){ Calendar c=Calendar.getInstance(); c.set(Integer.parseInt(beginYear), Integer.parseInt(beginMonth)-1, 1); endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH); beginDay="01"; }else{ beginDay=endDay=mDate.group(5); //Get Hour(7) if(mDate.group(7)!=null){ beginHour=endHour=mDate.group(7); //Get Minute(9) if(mDate.group(9)!=null){ beginMinute=endMinute=mDate.group(9); //Get Second(11) if(mDate.group(11)!=null){ beginSecond=endSecond=mDate.group(11); } } } } } }else if(matchesCentury){ beginYear=mCentury.group(1)+"00"; endYear=mCentury.group(1)+"99"; }else if(matchesDecade){ beginYear=mDecade.group(1)+"0"; endYear=mDecade.group(1)+"9"; }else if(matchesQuarter){ beginYear=endYear=mQuarter.group(1); int beginMonthI=3*(Integer.parseInt(mQuarter.group(2))-1)+1; beginMonth=""+beginMonthI; endMonth=""+(beginMonthI+2); Calendar c=Calendar.getInstance(); c.set(Integer.parseInt(beginYear), Integer.parseInt(endMonth)-1, 1); endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH); }else if(matchesHalf){ beginYear=endYear=mHalf.group(1); int beginMonthI=6*(Integer.parseInt(mHalf.group(2))-1)+1; beginMonth=""+beginMonthI; endMonth=""+(beginMonthI+5); Calendar c=Calendar.getInstance(); c.set(Integer.parseInt(beginYear), Integer.parseInt(endMonth)-1, 1); endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH); }else if(matchesSeason){ beginYear=mSeason.group(1); endYear=beginYear; if(mSeason.group(2).equals("SP")){ beginMonth="03"; beginDay="21"; endMonth="06"; endDay="20"; }else if(mSeason.group(2).equals("SU")){ beginMonth="06"; beginDay="21"; endMonth="09"; endDay="22"; }else if(mSeason.group(2).equals("FA")){ beginMonth="09"; beginDay="23"; endMonth="12"; endDay="21"; }else if(mSeason.group(2).equals("WI")){ endYear=""+(Integer.parseInt(beginYear)+1); beginMonth="12"; beginDay="22"; endMonth="03"; endDay="20"; } }else if(matchesWeek){ beginYear=endYear=mWeek.group(1); Calendar c=Calendar.getInstance(); c.setFirstDayOfWeek(Calendar.MONDAY); c.set(Calendar.YEAR,Integer.parseInt(beginYear)); c.set(Calendar.WEEK_OF_YEAR, Integer.parseInt(mWeek.group(2))); c.set(Calendar.DAY_OF_WEEK, Calendar.MONDAY); beginDay=""+c.get(Calendar.DAY_OF_MONTH); beginMonth=""+(c.get(Calendar.MONTH)+1); c.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY); endDay=""+(c.get(Calendar.DAY_OF_MONTH)); endMonth=""+(c.get(Calendar.MONTH)+1); }else if(matchesWeekend){ beginYear=endYear=mWeekend.group(1); Calendar c=Calendar.getInstance(); c.setFirstDayOfWeek(Calendar.MONDAY); c.set(Calendar.YEAR,Integer.parseInt(beginYear)); c.set(Calendar.WEEK_OF_YEAR, Integer.parseInt(mWeekend.group(2))); c.set(Calendar.DAY_OF_WEEK, Calendar.SATURDAY); beginDay=""+c.get(Calendar.DAY_OF_MONTH); beginMonth=""+(c.get(Calendar.MONTH)+1); c.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY); endDay=""+(c.get(Calendar.DAY_OF_MONTH)); endMonth=""+(c.get(Calendar.MONTH)+1); }else if(matchesTimeOfDay){ beginYear=endYear=mTimeOfDay.group(1); beginMonth=endMonth=mTimeOfDay.group(2); beginDay=endDay=mTimeOfDay.group(3); } // correct month and days < 10 if (Integer.parseInt(beginDay) < 10){ beginDay = "0" + Integer.parseInt(beginDay); } if (Integer.parseInt(beginMonth) < 10){ beginMonth = "0" + Integer.parseInt(beginMonth); } if (Integer.parseInt(endDay) < 10){ endDay = "0" + Integer.parseInt(endDay); } if (Integer.parseInt(endMonth) < 10){ endMonth = "0" + Integer.parseInt(endMonth); } if(!beginYear.equals("UNDEF") && !endYear.equals("UNDEF")){ annotation.setTimexValueEB(beginYear+"-"+beginMonth+"-"+beginDay+"T"+beginHour+":"+beginMinute+":"+beginSecond); // annotation.setTimexValueLB(beginYear+"-"+beginMonth+"-"+beginDay+"T"+endHour+":"+endMinute+":"+endSecond); // annotation.setTimexValueEE(endYear+"-"+endMonth+"-"+endDay+"T"+beginHour+":"+beginMinute+":"+beginSecond); annotation.setTimexValueLE(endYear+"-"+endMonth+"-"+endDay+"T"+endHour+":"+endMinute+":"+endSecond); annotation.setTimexValueLB(endYear+"-"+endMonth+"-"+endDay+"T"+endHour+":"+endMinute+":"+endSecond); annotation.setTimexValueEE(beginYear+"-"+beginMonth+"-"+beginDay+"T"+beginHour+":"+beginMinute+":"+beginSecond); //Copy Values from the Timex3 Annotation annotation.setTimexFreq(timex3.getTimexFreq()); annotation.setTimexId(timex3.getTimexId()); annotation.setTimexInstance(timex3.getTimexInstance()); annotation.setTimexMod(timex3.getTimexMod()); annotation.setTimexQuant(timex3.getTimexMod()); annotation.setTimexType(timex3.getTimexType()); annotation.setTimexValue(timex3.getTimexValue()); annotation.setSentId(timex3.getSentId()); annotation.setBegin(timex3.getBegin()); annotation.setFoundByRule(timex3.getFoundByRule()); annotation.setEnd(timex3.getEnd()); annotation.setAllTokIds(timex3.getAllTokIds()); annotation.setFilename(timex3.getFilename()); annotation.setBeginTimex(timex3.getTimexId()); annotation.setEndTimex(timex3.getTimexId()); // remember this one for addition to indexes later newAnnotations.add(annotation); } } // add to indexes for(Timex3Interval t3i : newAnnotations) t3i.addToIndexes(); } }