package de.unihd.dbs.uima.annotator.heideltime.utilities;

import java.util.List;
import java.util.TreeMap;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;

import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;

import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager;
import de.unihd.dbs.uima.annotator.heideltime.resources.RePatternManager;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.Token;

/**
 * This class contains methods that work with the dependence of a subject with its
 * surrounding data; namely via the jcas element or a subset list.
 *
 * @author jannik stroetgen
 */
public class ContextAnalyzer {

    // Value prefixes recognised by getLastMentionedX. They are applied with
    // Matcher.matches(), i.e. against the whole timex value.
    private static final Pattern CENTURY_VALUE = Pattern.compile("^[0-9][0-9]...*");
    private static final Pattern DECADE_VALUE = Pattern.compile("^[0-9][0-9][0-9]..*");
    private static final Pattern YEAR_VALUE = Pattern.compile("^[0-9][0-9][0-9][0-9].*");
    private static final Pattern YEAR_MONTH_VALUE =
            Pattern.compile("^[0-9][0-9][0-9][0-9]-[0-9][0-9].*");
    private static final Pattern FULL_DATE_VALUE =
            Pattern.compile("^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].*");
    private static final Pattern YEAR_WEEK_VALUE =
            Pattern.compile("^[0-9][0-9][0-9][0-9]-W[0-9][0-9].*");
    private static final Pattern YEAR_QUARTER_VALUE =
            Pattern.compile("^[0-9][0-9][0-9][0-9]-Q[1234].*");
    private static final Pattern YEAR_SEASON_VALUE =
            Pattern.compile("^[0-9][0-9][0-9][0-9]-(SP|SU|FA|WI).*");

    // French participles that turn "VER:pres VER:pper" into a future reading.
    private static final Pattern FRENCH_PLANNED_PARTICIPLE =
            Pattern.compile("^(?:prévue?s?|envisagée?s?)$");

    // Context patterns used by checkInfrontBehind.
    private static final Pattern DIGIT_THEN_DOT = Pattern.compile("\\d\\.");
    private static final Pattern BLOCKING_CHAR_BEFORE = Pattern.compile("[\\w\\$\\+]");
    private static final Pattern BLOCKING_CHAR_AFTER = Pattern.compile("[°\\w]");
    private static final Pattern SEPARATOR_THEN_DIGIT = Pattern.compile("[\\.,]\\d");

    /**
     * Walks backwards from timex {@code i} through the earlier entries of
     * {@code linearDates} and returns the requested granularity {@code x} of the
     * closest earlier timex whose value provides it.
     *
     * <p>Entries that begin at the same offset as timex {@code i}, entries without a
     * value and entries whose value contains {@code "funcDate"} are skipped. The scan
     * always moves on to the previous entry, so it terminates for every input,
     * including an unsupported {@code x}.
     *
     * @param linearDates list of previous linear dates
     * @param i index of the timex for which the last mentioned x is looked up
     * @param x granularity to search for: one of {@code century}, {@code decade},
     *          {@code year}, {@code dateYear}, {@code month}, {@code month-with-details},
     *          {@code day}, {@code week}, {@code quarter}, {@code dateQuarter},
     *          {@code season}
     * @param language language used to obtain the normalization manager
     * @return the last mentioned value of the requested granularity, or an empty
     *         string if no earlier timex provides one
     */
    public static String getLastMentionedX(List<Timex3> linearDates, int i, String x, Language language) {
        NormalizationManager nm = NormalizationManager.getInstance(language);

        // Timex for which to get the last mentioned x (i.e., Timex i)
        Timex3 current = linearDates.get(i);

        for (int j = i - 1; j >= 0; j--) {
            Timex3 candidate = linearDates.get(j);
            // the two timexes to compare must not have the same offset
            if (current.getBegin() == candidate.getBegin()) {
                continue;
            }
            String value = candidate.getTimexValue();
            if (value == null || value.contains("funcDate")) {
                continue;
            }
            String extracted = extractGranularity(x, value, nm);
            if (extracted != null) {
                return extracted;
            }
        }
        return "";
    }

    /**
     * Derives the requested granularity from a single timex value.
     *
     * @return the derived value, or {@code null} if {@code value} does not carry the
     *         requested granularity or {@code x} is not a supported granularity
     */
    private static String extractGranularity(String x, String value, NormalizationManager nm) {
        if (x.equals("century")) {
            return matchesFully(CENTURY_VALUE, value) ? value.substring(0, 2) : null;
        }
        if (x.equals("decade")) {
            return matchesFully(DECADE_VALUE, value) ? value.substring(0, 3) : null;
        }
        if (x.equals("year")) {
            return matchesFully(YEAR_VALUE, value) ? value.substring(0, 4) : null;
        }
        if (x.equals("dateYear")) {
            return matchesFully(YEAR_VALUE, value) ? value : null;
        }
        if (x.equals("month")) {
            return matchesFully(YEAR_MONTH_VALUE, value) ? value.substring(0, 7) : null;
        }
        if (x.equals("month-with-details")) {
            return matchesFully(YEAR_MONTH_VALUE, value) ? value : null;
        }
        if (x.equals("day")) {
            return matchesFully(FULL_DATE_VALUE, value) ? value.substring(0, 10) : null;
        }
        if (x.equals("week")) {
            if (matchesFully(FULL_DATE_VALUE, value)) {
                // year + "-W" + week computed from the full date (first 10 characters)
                return value.substring(0, 4) + "-W"
                        + DateCalculator.getWeekOfDate(value.substring(0, 10));
            }
            if (matchesFully(YEAR_WEEK_VALUE, value)) {
                return value.substring(0, 8);
            }
            return null;
        }
        if (x.equals("quarter")) {
            if (matchesFully(YEAR_MONTH_VALUE, value)) {
                String quarter = nm.getFromNormMonthInQuarter(value.substring(5, 7));
                return value.substring(0, 4) + "-Q" + quarter;
            }
            return matchesFully(YEAR_QUARTER_VALUE, value) ? value.substring(0, 7) : null;
        }
        if (x.equals("dateQuarter")) {
            return matchesFully(YEAR_QUARTER_VALUE, value) ? value.substring(0, 7) : null;
        }
        if (x.equals("season")) {
            if (matchesFully(YEAR_MONTH_VALUE, value)) {
                String season = nm.getFromNormMonthInSeason(value.substring(5, 7));
                return value.substring(0, 4) + "-" + season;
            }
            return matchesFully(YEAR_SEASON_VALUE, value) ? value.substring(0, 7) : null;
        }
        return null;
    }

    /** Returns whether {@code pattern} matches the complete {@code input}. */
    private static boolean matchesFully(Pattern pattern, String input) {
        return pattern.matcher(input).matches();
    }

    /**
     * Get the tense of the tense-bearing token that is closest to the timex within
     * its sentence, looking both in front of and behind the timex.
     *
     * <p>In front of the timex the last token that signals a tense is used, behind
     * the timex the first one. If both exist, the one with the smaller token
     * distance to the timex wins; on a tie the tense in front of the timex is
     * returned.
     *
     * @param timex timex construct to discover tense data for
     * @param jcas CAS object that holds the sentence and token annotations
     * @param language language used to obtain the tense patterns
     * @return {@code "PRESENTFUTURE"}, {@code "PAST"}, {@code "FUTURE"}, or an empty
     *         string if no tense was found
     */
    public static String getClosestTense(Timex3 timex, JCas jcas, Language language) {
        RePatternManager rpm = RePatternManager.getInstance(language);
        TreeMap<Integer, Token> tokensByEnd =
                collectTokensByEnd(findEnclosingSentence(timex, jcas), jcas);

        // Last tense in front of the timex; timexId is the (1-based) position of the
        // first token that does not end in front of the timex.
        String lastTense = "";
        int lastId = 0;
        int timexId = 0;
        int tokenCounter = 0;
        for (Token token : tokensByEnd.values()) {
            tokenCounter++;
            if (token.getEnd() < timex.getBegin()) {
                String detected = detectTense(token, rpm, "LAST");
                if (!detected.isEmpty()) {
                    lastTense = detected;
                    lastId = tokenCounter;
                }
            } else if (timexId == 0) {
                timexId = tokenCounter;
            }
        }

        // First tense behind the timex.
        String nextTense = "";
        int nextId = 0;
        tokenCounter = 0;
        for (Token token : tokensByEnd.values()) {
            tokenCounter++;
            if (token.getEnd() > timex.getEnd()) {
                String detected = detectTense(token, rpm, "NEXT");
                if (!detected.isEmpty()) {
                    nextTense = detected;
                    nextId = tokenCounter;
                    break;
                }
            }
        }

        String tense;
        if (lastTense.isEmpty()) {
            tense = nextTense;
        } else if (nextTense.isEmpty()) {
            tense = lastTense;
        } else {
            // tense before and after the timex: return the closer one
            tense = ((timexId - lastId) > (nextId - timexId)) ? nextTense : lastTense;
        }
        Logger.printDetail("TENSE: " + tense);
        return tense;
    }

    /**
     * Get the last tense used in the sentence in front of the timex; if there is
     * none, the first tense found behind the timex is used instead.
     *
     * <p>The tokens "since" and "depuis" in front of the timex force {@code "PAST"}.
     * Afterwards the result is refined with two-token POS constraints (present
     * auxiliary followed by a past participle), see
     * {@link #refineWithDoublePosConstraints(String, TreeMap, Timex3)}.
     *
     * @param timex timex construct to discover tense data for
     * @param jcas CAS object that holds the sentence and token annotations
     * @param language language used to obtain the tense patterns
     * @return {@code "PRESENTFUTURE"}, {@code "PAST"}, {@code "FUTURE"}, or an empty
     *         string if no tense was found
     */
    public static String getLastTense(Timex3 timex, JCas jcas, Language language) {
        RePatternManager rpm = RePatternManager.getInstance(language);
        TreeMap<Integer, Token> tokensByEnd =
                collectTokensByEnd(findEnclosingSentence(timex, jcas), jcas);

        String lastTense = "";
        for (Token token : tokensByEnd.values()) {
            int tokEnd = token.getEnd();
            if (tokEnd < timex.getBegin()) {
                String detected = detectTense(token, rpm, "LAST");
                if (!detected.isEmpty()) {
                    lastTense = detected;
                    Logger.printDetail("this tense:" + lastTense);
                }
                String text = token.getCoveredText();
                // "since" (English) and "depuis" (French) signal a past context
                if (text.equals("since") || text.equals("depuis")) {
                    lastTense = "PAST";
                    Logger.printDetail("this tense:" + lastTense);
                }
            }
            // only look behind the timex as long as nothing was found so far
            if (lastTense.equals("") && tokEnd > timex.getEnd()) {
                String detected = detectTense(token, rpm, "NEXT");
                if (!detected.isEmpty()) {
                    lastTense = detected;
                    Logger.printDetail("this tense:" + lastTense);
                }
            }
        }

        lastTense = refineWithDoublePosConstraints(lastTense, tokensByEnd, timex);
        Logger.printDetail("TENSE: " + lastTense);
        return lastTense;
    }

    /**
     * Applies the double POS constraints that are not part of the rule language yet
     * (TODO): a present-tense auxiliary directly followed by a past participle.
     *
     * <ul>
     * <li>{@code PRESENTFUTURE} becomes {@code PAST} unless the participle is
     *     "expected" or "scheduled".</li>
     * <li>{@code PAST} becomes {@code FUTURE} for French "VER:pres VER:pper" when the
     *     participle is a form of "prévu" or "envisagé".</li>
     * </ul>
     *
     * Tokens behind the timex are only inspected as long as no constraint has fired.
     * Tokens without a POS tag never satisfy a constraint.
     */
    private static String refineWithDoublePosConstraints(String tense,
            TreeMap<Integer, Token> tokensByEnd, Timex3 timex) {
        String lastTense = tense;
        // NOTE: prevPos and longTense are deliberately shared by both passes, as in
        // the original implementation.
        String prevPos = "";
        String longTense = "";

        // VHZ VVN, VBZ VVN, VHP VVN, VBP VVN and VER:pres VER:pper
        if (lastTense.equals("PRESENTFUTURE")) {
            for (Token token : tokensByEnd.values()) {
                int tokEnd = token.getEnd();
                if (tokEnd < timex.getBegin()
                        || (longTense.equals("") && tokEnd > timex.getEnd())) {
                    String pos = token.getPos();
                    if (isPresentAuxiliaryPos(prevPos) && isPastParticiplePos(pos)) {
                        String text = token.getCoveredText();
                        if (!text.equals("expected") && !text.equals("scheduled")) {
                            lastTense = "PAST";
                            longTense = "PAST";
                            Logger.printDetail("this tense:" + lastTense);
                        }
                    }
                    prevPos = pos;
                }
            }
        }

        // French: VER:pres VER:pper
        if (lastTense.equals("PAST")) {
            for (Token token : tokensByEnd.values()) {
                int tokEnd = token.getEnd();
                if (tokEnd < timex.getBegin()
                        || (longTense.equals("") && tokEnd > timex.getEnd())) {
                    String pos = token.getPos();
                    if ("VER:pres".equals(prevPos) && "VER:pper".equals(pos)
                            && FRENCH_PLANNED_PARTICIPLE.matcher(token.getCoveredText()).matches()) {
                        lastTense = "FUTURE";
                        longTense = "FUTURE";
                        Logger.printDetail("this tense:" + lastTense);
                    }
                    prevPos = pos;
                }
            }
        }
        return lastTense;
    }

    /** Returns whether {@code pos} is one of the present-tense auxiliary/verb tags. */
    private static boolean isPresentAuxiliaryPos(String pos) {
        return "VHZ".equals(pos) || "VBZ".equals(pos) || "VHP".equals(pos)
                || "VBP".equals(pos) || "VER:pres".equals(pos);
    }

    /** Returns whether {@code pos} is one of the past participle tags. */
    private static boolean isPastParticiplePos(String pos) {
        return "VVN".equals(pos) || "VER:pper".equals(pos);
    }

    /**
     * Returns the first sentence that covers the timex (bounds inclusive, so a timex
     * at the very beginning or end of a sentence is covered). If no sentence covers
     * it, the last sentence of the index is returned, or a fresh {@link Sentence} if
     * the index is empty.
     */
    private static Sentence findEnclosingSentence(Timex3 timex, JCas jcas) {
        FSIterator iterSentence = jcas.getAnnotationIndex(Sentence.type).iterator();
        Sentence sentence = new Sentence(jcas);
        while (iterSentence.hasNext()) {
            sentence = (Sentence) iterSentence.next();
            if (sentence.getBegin() <= timex.getBegin() && sentence.getEnd() >= timex.getEnd()) {
                break;
            }
        }
        return sentence;
    }

    /** Collects the tokens of the sentence, ordered by their end offset. */
    private static TreeMap<Integer, Token> collectTokensByEnd(Sentence sentence, JCas jcas) {
        TreeMap<Integer, Token> tokensByEnd = new TreeMap<Integer, Token>();
        FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(sentence);
        while (iterToken.hasNext()) {
            Token token = (Token) iterToken.next();
            tokensByEnd.put(token.getEnd(), token);
        }
        return tokensByEnd;
    }

    /**
     * Classifies a single token using the tense patterns of the pattern manager and
     * logs the details of the check.
     *
     * @param token token to classify
     * @param rpm pattern manager providing the "tensePos4..." / "tenseWord4Future" patterns
     * @param direction "LAST" or "NEXT"; only used for the log message
     * @return {@code "PRESENTFUTURE"}, {@code "PAST"}, {@code "FUTURE"}, or an empty
     *         string if the token has no POS tag or signals no tense
     */
    private static String detectTense(Token token, RePatternManager rpm, String direction) {
        String pos = token.getPos();
        Logger.printDetail("GET " + direction + " TENSE: string:" + token.getCoveredText() + " pos:" + pos);
        Logger.printDetail("hmAllRePattern.containsKey(tensePos4PresentFuture):" + rpm.get("tensePos4PresentFuture"));
        Logger.printDetail("hmAllRePattern.containsKey(tensePos4Future):" + rpm.get("tensePos4Future"));
        Logger.printDetail("hmAllRePattern.containsKey(tensePos4Past):" + rpm.get("tensePos4Past"));
        Logger.printDetail("CHECK TOKEN:" + pos);

        if (pos == null) {
            return "";
        }
        if (rpm.containsKey("tensePos4PresentFuture") && pos.matches(rpm.get("tensePos4PresentFuture"))) {
            return "PRESENTFUTURE";
        }
        if (rpm.containsKey("tensePos4Past") && pos.matches(rpm.get("tensePos4Past"))) {
            return "PAST";
        }
        if (rpm.containsKey("tensePos4Future") && pos.matches(rpm.get("tensePos4Future"))
                && token.getCoveredText().matches(rpm.get("tenseWord4Future"))) {
            // a future POS only counts together with a future word
            return "FUTURE";
        }
        return "";
    }

    /**
     * Check token boundaries of expressions.
     *
     * @param r MatchResult, with offsets relative to the covered text of {@code s}
     * @param s Respective sentence
     * @return whether or not the MatchResult is a clean one
     */
    public static Boolean checkInfrontBehind(MatchResult r, Sentence s) {
        String text = s.getCoveredText();
        int start = r.start();
        int end = r.end();

        // get rid of expressions such as "1999" in 53453.1999
        if (start > 1 && matchesFully(DIGIT_THEN_DOT, text.substring(start - 2, start))) {
            return false;
        }
        // get rid of expressions if there is a character or symbol ($+) directly in front of the expression
        if (start > 0 && matchesFully(BLOCKING_CHAR_BEFORE, text.substring(start - 1, start))) {
            return false;
        }
        if (end < text.length()) {
            // a word character or degree sign directly behind the expression
            if (matchesFully(BLOCKING_CHAR_AFTER, text.substring(end, end + 1))) {
                return false;
            }
            // a decimal or thousands separator followed by a digit, e.g. "1999" in 1999.5
            if (end + 1 < text.length()
                    && matchesFully(SEPARATOR_THEN_DIGIT, text.substring(end, end + 2))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check token boundaries using token information.
     *
     * <p>A match surrounded by single blanks on both sides is accepted right away.
     * Otherwise begin and end of the match each have to coincide with a token
     * boundary of the sentence, or be adjacent to one of the symbols
     * ".", "/", "-", "–" that the tokenizer does not split from numbers.
     *
     * @param r MatchResult, with offsets relative to the covered text of {@code s}
     * @param s respective Sentence
     * @param jcas current CAS object
     * @return whether or not the MatchResult is a clean one
     */
    public static Boolean checkTokenBoundaries(MatchResult r, Sentence s, JCas jcas) {
        String text = s.getCoveredText();
        int start = r.start();
        int end = r.end();
        boolean hasCharBefore = start > 0;
        boolean hasCharAfter = end < text.length();

        // Only check Token boundaries if no white-spaces in front of and behind the match-result
        if (hasCharBefore && text.charAt(start - 1) == ' '
                && hasCharAfter && text.charAt(end) == ' ') {
            return true;
        }

        // Tokenizer does not split number from some symbols (".", "/", "-", "–"),
        // e.g., "...12 August-24 August..." or "... in 1990. New Sentence ..."
        boolean symbolBefore = hasCharBefore && isUnsplitSymbol(text.charAt(start - 1));
        boolean symbolAfter = hasCharAfter && isUnsplitSymbol(text.charAt(end));

        int absoluteStart = start + s.getBegin();
        int absoluteEnd = end + s.getBegin();
        boolean beginOK = false;
        boolean endOK = false;

        // other token boundaries than white-spaces
        FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(s);
        while (iterToken.hasNext()) {
            Token t = (Token) iterToken.next();
            if (absoluteStart == t.getBegin() || symbolBefore) {
                beginOK = true;
            }
            if (absoluteEnd == t.getEnd() || symbolAfter) {
                endOK = true;
            }
            if (beginOK && endOK) {
                return true;
            }
        }
        return false;
    }

    /** Returns whether {@code c} is a symbol the tokenizer leaves attached to numbers. */
    private static boolean isUnsplitSymbol(char c) {
        return c == '.' || c == '/' || c == '–' || c == '-';
    }
}