package de.unihd.dbs.uima.annotator.intervaltagger;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.resources.RePatternManager;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox;
import de.unihd.dbs.uima.types.heideltime.IntervalCandidateSentence;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.Timex3Interval;
/**
* IntervalTagger is a UIMA annotator that discovers and tags intervals in documents.
* @author Manuel Dewald, Julian Zell
*
*/
public class IntervalTagger extends JCasAnnotator_ImplBase {
// TOOL NAME (may be used as componentId)
private Class<?> component = this.getClass();
// descriptor parameter names
public static String PARAM_LANGUAGE = "language";
public static String PARAM_INTERVALS = "annotate_intervals";
public static String PARAM_INTERVAL_CANDIDATES = "annotate_interval_candidates";
// descriptor configuration
private Language language = null;
private Boolean find_intervals = true;
private Boolean find_interval_candidates = true;
private HashMap<Pattern, String> hmIntervalPattern = new HashMap<Pattern, String>();
private HashMap<String, String> hmIntervalNormalization = new HashMap<String, String>();
/**
* initialization: read configuration parameters and resources
*/
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));
find_intervals = (Boolean) aContext.getConfigParameterValue(PARAM_INTERVALS);
find_interval_candidates = (Boolean) aContext.getConfigParameterValue(PARAM_INTERVAL_CANDIDATES);
readResources(readResourcesFromDirectory("rules"));
}
/**
* called by the pipeline to process the document
*/
public void process(JCas jcas) throws AnalysisEngineProcessException {
if(find_intervals) {
findIntervals(jcas);
findSentenceIntervals(jcas);
}
}
/**
* reads in heideltime's resource files.
* @throws ResourceInitializationException
*/
private void readResources(HashMap<String, String> hmResourcesRules) throws ResourceInitializationException {
Pattern paReadRules = Pattern.compile("RULENAME=\"(.*?)\",EXTRACTION=\"(.*?)\",NORM_VALUE=\"(.*?)\"(.*)");
// read normalization data
try {
for (String resource : hmResourcesRules.keySet()) {
BufferedReader br = new BufferedReader(new InputStreamReader (this.getClass().getClassLoader().getResourceAsStream(hmResourcesRules.get(resource))));
Logger.printDetail(component, "Adding rule resource: "+resource);
for ( String line; (line=br.readLine()) != null; ) {
if (!(line.startsWith("//"))) {
boolean correctLine = false;
if (!(line.equals(""))) {
Logger.printDetail("DEBUGGING: reading rules..."+ line);
// check each line for the name, extraction, and normalization part
for (MatchResult r : Toolbox.findMatches(paReadRules, line)) {
correctLine = true;
String rule_name = r.group(1);
String rule_extraction = r.group(2);
String rule_normalization = r.group(3);
////////////////////////////////////////////////////////////////////
// RULE EXTRACTION PARTS ARE TRANSLATED INTO REGULAR EXPRESSSIONS //
////////////////////////////////////////////////////////////////////
// create pattern for rule extraction part
Pattern paVariable = Pattern.compile("%(re[a-zA-Z0-9]*)");
RePatternManager rpm = RePatternManager.getInstance(language);
for (MatchResult mr : Toolbox.findMatches(paVariable,rule_extraction)) {
Logger.printDetail("DEBUGGING: replacing patterns..."+ mr.group());
if (!(rpm.containsKey(mr.group(1)))) {
Logger.printError("Error creating rule:"+rule_name);
Logger.printError("The following pattern used in this rule does not exist, does it? %"+mr.group(1));
System.exit(-1);
}
rule_extraction = rule_extraction.replaceAll("%"+mr.group(1), rpm.get(mr.group(1)));
}
rule_extraction = rule_extraction.replaceAll(" ", "[\\\\s]+");
Pattern pattern = null;
try{
pattern = Pattern.compile(rule_extraction);
}
catch (java.util.regex.PatternSyntaxException e) {
Logger.printError("Compiling rules resulted in errors.");
Logger.printError("Problematic rule is "+rule_name);
Logger.printError("Cannot compile pattern: "+rule_extraction);
e.printStackTrace();
System.exit(-1);
}
/////////////////////////////////////////////////
// READ INTERVAL RULES AND MAKE THEM AVAILABLE //
/////////////////////////////////////////////////
if(resource.equals("intervalrules")){
hmIntervalPattern.put(pattern,rule_name);
hmIntervalNormalization.put(rule_name, rule_normalization);
}
}
}
///////////////////////////////////////////
// CHECK FOR PROBLEMS WHEN READING RULES //
///////////////////////////////////////////
if ((correctLine == false) && (!(line.matches("")))) {
Logger.printError(component, "Cannot read the following line of rule resource "+resource);
Logger.printError(component, "Line: "+line);
}
}
}
}
} catch (IOException e) {
e.printStackTrace();
throw new ResourceInitializationException();
}
}
/**
* Reads resource files of the type resourceType from the "used_resources.txt" file and returns a HashMap
* containing information to access these resources.
* @return HashMap containing filename/path tuples
*/
protected HashMap<String, String> readResourcesFromDirectory(String resourceType) {
HashMap<String, String> hmResources = new HashMap<String, String>();
BufferedReader br = new BufferedReader(new InputStreamReader(this.getClass().getClassLoader().getResourceAsStream("used_resources.txt")));
try {
for (String line; (line=br.readLine()) != null; ) {
String pathDelim = System.getProperty("file.separator");
Pattern paResource = Pattern.compile(".(?:\\"+pathDelim+"|/)?(\\"+pathDelim+"|/)"+language.getResourceFolder()+"(?:\\"+pathDelim+"|/)"+resourceType+"(?:\\"+pathDelim+"|/)"+"resources_"+resourceType+"_"+"(.*?)\\.txt");
for (MatchResult ro : Toolbox.findMatches(paResource, line)){
pathDelim = ro.group(1);
String foundResource = ro.group(2);
String pathToResource = language.getResourceFolder()+ro.group(1)+resourceType+ro.group(1)+"resources_"+resourceType+"_"+foundResource+".txt";
hmResources.put(foundResource, pathToResource);
}
}
} catch (IOException e) {
e.printStackTrace();
Logger.printError(component, "Failed to read a resource from used_resources.txt.");
System.exit(-1);
}
return hmResources;
}
/**
* Extract Timex3Intervals, delimited by two Timex3Intervals in a sentence.
* finsInterval needs to be run with jcas before.
* @param jcas
* @author Manuel Dewald
*/
private void findSentenceIntervals(JCas jcas){
FSIterator iterSentence = jcas.getAnnotationIndex(Sentence.type).iterator();
while (iterSentence.hasNext()) {
Sentence s=(Sentence)iterSentence.next();
String sString=s.getCoveredText();
FSIterator iterInter = jcas.getAnnotationIndex(Timex3Interval.type).subiterator(s);
int count=0;
List<Timex3Interval> txes=new ArrayList<Timex3Interval>();
List<Timex3Interval> sentenceTxes=new ArrayList<Timex3Interval>();
while(iterInter.hasNext()){
Timex3Interval t=(Timex3Interval)iterInter.next();
sString=sString.replace(t.getCoveredText(), "<TX3_"+count+">");
count++;
txes.add(t);
}
if(count>1){
if (find_interval_candidates){
IntervalCandidateSentence sI=new IntervalCandidateSentence(jcas);
sI.setBegin(s.getBegin());
sI.setEnd(s.getEnd());
sI.addToIndexes();
}
for(Pattern p: hmIntervalPattern.keySet()){
String name=hmIntervalPattern.get(p);
List<MatchResult>results=(List<MatchResult>)Toolbox.findMatches(p,sString);
if(results.size()>0){
//Interval in Sentence s found by Pattern p!
for(MatchResult r: results){
Pattern pNorm=Pattern.compile("group\\(([1-9]+)\\)-group\\(([1-9]+)\\)");
String norm=hmIntervalNormalization.get(name);
Matcher mNorm=pNorm.matcher(norm);
if(!mNorm.matches()){
System.err.println("Problem with the Norm in rule "+name);
}
Timex3Interval startTx=null,endTx=null;
try{
int startId=Integer.parseInt(mNorm.group(1));
int endId=Integer.parseInt(mNorm.group(2));
startTx=txes.get(Integer.parseInt(r.group(startId)));
endTx=txes.get(Integer.parseInt(r.group(endId)));
}catch(Exception e){
e.printStackTrace();
return;
}
Timex3Interval annotation=new Timex3Interval(jcas);
annotation.setBegin(startTx.getBegin()>endTx.getBegin()?endTx.getBegin():startTx.getBegin());
annotation.setEnd(startTx.getEnd()>endTx.getEnd()?startTx.getEnd():endTx.getEnd());
//Does the interval already exist,
//found by another pattern?
boolean duplicate=false;
for(Timex3Interval tx:sentenceTxes){
if(tx.getBegin()==annotation.getBegin() &&
tx.getEnd()==annotation.getEnd()){
duplicate=true;
break;
}
}
if(!duplicate){
annotation.setTimexValueEB(startTx.getTimexValueEB());
annotation.setTimexValueLB(startTx.getTimexValueEE());
annotation.setTimexValueEE(endTx.getTimexValueEB());
annotation.setTimexValueLE(endTx.getTimexValueEE());
annotation.setTimexType(startTx.getTimexType());
annotation.setFoundByRule(name);
annotation.addToIndexes();
sentenceTxes.add(annotation);
}
}
}
}
}
}
}
/**
* Build Timex3Interval-Annotations out of Timex3Annotations in jcas.
* @author Manuel Dewald
* @param jcas
*/
private void findIntervals(JCas jcas) {
ArrayList<Timex3Interval> newAnnotations = new ArrayList<Timex3Interval>();
FSIterator iterTimex3 = jcas.getAnnotationIndex(Timex3.type).iterator();
while (iterTimex3.hasNext()) {
Timex3Interval annotation=new Timex3Interval(jcas);
Timex3 timex3 = (Timex3) iterTimex3.next();
//DATE Pattern
Pattern pDate = Pattern.compile("(\\d+)(-(\\d+)(-(\\d+(T(\\d+)(:(\\d+)(:(\\d+))?)?)?))?)?");
Pattern pCentury = Pattern.compile("(\\d\\d)XX");
Pattern pDecate = Pattern.compile("(\\d\\d\\d)X");
Pattern pQuarter = Pattern.compile("(\\d+)-Q([1-4])");
Pattern pHalf = Pattern.compile("(\\d+)-H([1-2])");
Pattern pSeason = Pattern.compile("(\\d+)-(SP|SU|FA|WI)");
Pattern pWeek = Pattern.compile("(\\d+)-W(\\d+)");
Pattern pWeekend = Pattern.compile("(\\d+)-W(\\d+)-WE");
Pattern pTimeOfDay = Pattern.compile("(\\d+)-(\\d+)-(\\d+)T(AF|DT|MI|MO|EV|NI)");
Matcher mDate = pDate.matcher(timex3.getTimexValue());
Matcher mCentury= pCentury.matcher(timex3.getTimexValue());
Matcher mDecade = pDecate.matcher(timex3.getTimexValue());
Matcher mQuarter= pQuarter.matcher(timex3.getTimexValue());
Matcher mHalf = pHalf.matcher(timex3.getTimexValue());
Matcher mSeason = pSeason.matcher(timex3.getTimexValue());
Matcher mWeek = pWeek.matcher(timex3.getTimexValue());
Matcher mWeekend= pWeekend.matcher(timex3.getTimexValue());
Matcher mTimeOfDay= pTimeOfDay.matcher(timex3.getTimexValue());
boolean matchesDate=mDate.matches();
boolean matchesCentury=mCentury.matches();
boolean matchesDecade=mDecade.matches();
boolean matchesQuarter=mQuarter.matches();
boolean matchesHalf=mHalf.matches();
boolean matchesSeason=mSeason.matches();
boolean matchesWeek=mWeek.matches();
boolean matchesWeekend=mWeekend.matches();
boolean matchesTimeOfDay=mTimeOfDay.matches();
String beginYear, endYear;
String beginMonth, endMonth;
String beginDay, endDay;
String beginHour, endHour;
String beginMinute, endMinute;
String beginSecond, endSecond;
beginYear=endYear="UNDEF";
beginMonth="01";
endMonth="12";
beginDay="01";
endDay="31";
beginHour="00";
endHour="23";
beginMinute="00";
endMinute="59";
beginSecond="00";
endSecond="59";
if(matchesDate){
//Get Year(1)
beginYear=endYear=mDate.group(1);
//Get Month(3)
if(mDate.group(3)!=null){
beginMonth=endMonth=mDate.group(3);
//Get Day(5)
if(mDate.group(5)==null){
Calendar c=Calendar.getInstance();
c.set(Integer.parseInt(beginYear), Integer.parseInt(beginMonth)-1, 1);
endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH);
beginDay="01";
}else{
beginDay=endDay=mDate.group(5);
//Get Hour(7)
if(mDate.group(7)!=null){
beginHour=endHour=mDate.group(7);
//Get Minute(9)
if(mDate.group(9)!=null){
beginMinute=endMinute=mDate.group(9);
//Get Second(11)
if(mDate.group(11)!=null){
beginSecond=endSecond=mDate.group(11);
}
}
}
}
}
}else if(matchesCentury){
beginYear=mCentury.group(1)+"00";
endYear=mCentury.group(1)+"99";
}else if(matchesDecade){
beginYear=mDecade.group(1)+"0";
endYear=mDecade.group(1)+"9";
}else if(matchesQuarter){
beginYear=endYear=mQuarter.group(1);
int beginMonthI=3*(Integer.parseInt(mQuarter.group(2))-1)+1;
beginMonth=""+beginMonthI;
endMonth=""+(beginMonthI+2);
Calendar c=Calendar.getInstance();
c.set(Integer.parseInt(beginYear), Integer.parseInt(endMonth)-1, 1);
endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH);
}else if(matchesHalf){
beginYear=endYear=mHalf.group(1);
int beginMonthI=6*(Integer.parseInt(mHalf.group(2))-1)+1;
beginMonth=""+beginMonthI;
endMonth=""+(beginMonthI+5);
Calendar c=Calendar.getInstance();
c.set(Integer.parseInt(beginYear), Integer.parseInt(endMonth)-1, 1);
endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH);
}else if(matchesSeason){
beginYear=mSeason.group(1);
endYear=beginYear;
if(mSeason.group(2).equals("SP")){
beginMonth="03";
beginDay="21";
endMonth="06";
endDay="20";
}else if(mSeason.group(2).equals("SU")){
beginMonth="06";
beginDay="21";
endMonth="09";
endDay="22";
}else if(mSeason.group(2).equals("FA")){
beginMonth="09";
beginDay="23";
endMonth="12";
endDay="21";
}else if(mSeason.group(2).equals("WI")){
endYear=""+(Integer.parseInt(beginYear)+1);
beginMonth="12";
beginDay="22";
endMonth="03";
endDay="20";
}
}else if(matchesWeek){
beginYear=endYear=mWeek.group(1);
Calendar c=Calendar.getInstance();
c.setFirstDayOfWeek(Calendar.MONDAY);
c.set(Calendar.YEAR,Integer.parseInt(beginYear));
c.set(Calendar.WEEK_OF_YEAR, Integer.parseInt(mWeek.group(2)));
c.set(Calendar.DAY_OF_WEEK, Calendar.MONDAY);
beginDay=""+c.get(Calendar.DAY_OF_MONTH);
beginMonth=""+(c.get(Calendar.MONTH)+1);
c.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY);
endDay=""+(c.get(Calendar.DAY_OF_MONTH));
endMonth=""+(c.get(Calendar.MONTH)+1);
}else if(matchesWeekend){
beginYear=endYear=mWeekend.group(1);
Calendar c=Calendar.getInstance();
c.setFirstDayOfWeek(Calendar.MONDAY);
c.set(Calendar.YEAR,Integer.parseInt(beginYear));
c.set(Calendar.WEEK_OF_YEAR, Integer.parseInt(mWeekend.group(2)));
c.set(Calendar.DAY_OF_WEEK, Calendar.SATURDAY);
beginDay=""+c.get(Calendar.DAY_OF_MONTH);
beginMonth=""+(c.get(Calendar.MONTH)+1);
c.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY);
endDay=""+(c.get(Calendar.DAY_OF_MONTH));
endMonth=""+(c.get(Calendar.MONTH)+1);
}else if(matchesTimeOfDay){
beginYear=endYear=mTimeOfDay.group(1);
beginMonth=endMonth=mTimeOfDay.group(2);
beginDay=endDay=mTimeOfDay.group(3);
}
if(!beginYear.equals("UNDEF") && !endYear.equals("UNDEF")){
annotation.setTimexValueEB(beginYear+"-"+beginMonth+"-"+beginDay+"T"+beginHour+":"+beginMinute+":"+beginSecond);
annotation.setTimexValueEE(endYear+"-"+endMonth+"-"+endDay+"T"+endHour+":"+endMinute+":"+endSecond);
annotation.setTimexValueLB(beginYear+"-"+beginMonth+"-"+beginDay+"T"+beginHour+":"+beginMinute+":"+beginSecond);
annotation.setTimexValueLE(endYear+"-"+endMonth+"-"+endDay+"T"+endHour+":"+endMinute+":"+endSecond);
//Copy Values from the Timex3 Annotation
annotation.setTimexFreq(timex3.getTimexFreq());
annotation.setTimexId(timex3.getTimexId());
annotation.setTimexInstance(timex3.getTimexInstance());
annotation.setTimexMod(timex3.getTimexMod());
annotation.setTimexQuant(timex3.getTimexMod());
annotation.setTimexType(timex3.getTimexType());
annotation.setTimexValue(timex3.getTimexValue());
annotation.setSentId(timex3.getSentId());
annotation.setBegin(timex3.getBegin());
annotation.setFoundByRule(timex3.getFoundByRule());
annotation.setEnd(timex3.getEnd());
annotation.setAllTokIds(timex3.getAllTokIds());
annotation.setFilename(timex3.getFilename());
// remember this one for addition to indexes later
newAnnotations.add(annotation);
}
}
// add to indexes
for(Timex3Interval t3i : newAnnotations)
t3i.addToIndexes();
}
}