/* * HeidelTime.java * * Copyright (c) 2011, Database Research Group, Institute of Computer Science, Heidelberg University. * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU General Public License. * * author: Jannik Strötgen * email: stroetgen@uni-hd.de * * HeidelTime is a multilingual, cross-domain temporal tagger. * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime */ package de.unihd.dbs.uima.annotator.heideltime; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.regex.MatchResult; import java.util.regex.Pattern; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.unihd.dbs.uima.annotator.heideltime.ProcessorManager.Priority; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager; import de.unihd.dbs.uima.annotator.heideltime.resources.RePatternManager; import de.unihd.dbs.uima.annotator.heideltime.resources.RuleManager; import de.unihd.dbs.uima.annotator.heideltime.utilities.DateCalculator; import de.unihd.dbs.uima.annotator.heideltime.utilities.ContextAnalyzer; import de.unihd.dbs.uima.annotator.heideltime.utilities.LocaleException; import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox; import de.unihd.dbs.uima.types.heideltime.Dct; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Timex3; import de.unihd.dbs.uima.types.heideltime.Token; /** * HeidelTime finds temporal expressions and normalizes them according to the TIMEX3 * TimeML annotation standard. 
* * @author jannik stroetgen * */ public class HeidelTime extends JCasAnnotator_ImplBase { // TOOL NAME (may be used as componentId) private Class<?> component = this.getClass(); // PROCESSOR MANAGER ProcessorManager procMan = ProcessorManager.getInstance(); // COUNTER (how many timexes added to CAS? (finally) public int timex_counter = 0; public int timex_counter_global = 0; // COUNTER FOR TIMEX IDS private int timexID = 0; // INPUT PARAMETER HANDLING WITH UIMA private String PARAM_LANGUAGE = "Language"; // supported languages (2012-05-19): english, german, dutch, englishcoll, englishsci private String PARAM_TYPE_TO_PROCESS = "Type"; // chosen locale parameter name private String PARAM_LOCALE = "locale"; // supported types (2012-05-19): news (english, german, dutch), narrative (english, german, dutch), colloquial private Language language = Language.ENGLISH; private String typeToProcess = "news"; // INPUT PARAMETER HANDLING WITH UIMA (which types shall be extracted) private String PARAM_DATE = "Date"; private String PARAM_TIME = "Time"; private String PARAM_DURATION = "Duration"; private String PARAM_SET = "Set"; private String PARAM_DEBUG = "Debugging"; private Boolean find_dates = true; private Boolean find_times = true; private Boolean find_durations = true; private Boolean find_sets = true; // FOR DEBUGGING PURPOSES (IF FALSE) private Boolean deleteOverlapped = true; /** * @see AnalysisComponent#initialize(UimaContext) */ public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); ///////////////////////////////// // DEBUGGING PARAMETER SETTING // ///////////////////////////////// this.deleteOverlapped = true; Boolean doDebug = (Boolean) aContext.getConfigParameterValue(PARAM_DEBUG); Logger.setPrintDetails(doDebug == null ? 
false : doDebug); ///////////////////////////////// // HANDLE LOCALE // ///////////////////////////////// String requestedLocale = (String) aContext.getConfigParameterValue(PARAM_LOCALE); if(requestedLocale == null || requestedLocale.length() == 0) { // if the PARAM_LOCALE setting was left empty, Locale.setDefault(Locale.UK); // use a default, the ISO8601-adhering UK locale (equivalent to "en_GB") } else { // otherwise, check if the desired locale exists in the JVM's available locale repertoire try { Locale locale = DateCalculator.getLocaleFromString(requestedLocale); Locale.setDefault(locale); // sets it for the entire JVM session } catch (LocaleException e) { Logger.printError("Supplied locale parameter couldn't be resolved to a working locale. Try one of these:"); String localesString = new String(); for(Locale l : Locale.getAvailableLocales()) { // list all available locales localesString += l.toString()+" "; } Logger.printError(localesString); System.exit(-1); } } ////////////////////////////////// // GET CONFIGURATION PARAMETERS // ////////////////////////////////// language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE)); typeToProcess = (String) aContext.getConfigParameterValue(PARAM_TYPE_TO_PROCESS); find_dates = (Boolean) aContext.getConfigParameterValue(PARAM_DATE); find_times = (Boolean) aContext.getConfigParameterValue(PARAM_TIME); find_durations = (Boolean) aContext.getConfigParameterValue(PARAM_DURATION); find_sets = (Boolean) aContext.getConfigParameterValue(PARAM_SET); //////////////////////////////////////////////////////////// // READ NORMALIZATION RESOURCES FROM FILES AND STORE THEM // //////////////////////////////////////////////////////////// NormalizationManager.getInstance(language); ////////////////////////////////////////////////////// // READ PATTERN RESOURCES FROM FILES AND STORE THEM // ////////////////////////////////////////////////////// RePatternManager.getInstance(language); 
/////////////////////////////////////////////////// // READ RULE RESOURCES FROM FILES AND STORE THEM // /////////////////////////////////////////////////// RuleManager.getInstance(language); ///////////////////////////////////////////////////////////////////////////////// // SUBPROCESSOR CONFIGURATION. REGISTER YOUR OWN PROCESSORS HERE FOR EXECUTION // ///////////////////////////////////////////////////////////////////////////////// procMan.registerProcessor("de.unihd.dbs.uima.annotator.heideltime.processors.HolidayProcessor"); procMan.initializeAllProcessors(aContext); ///////////////////////////// // PRINT WHAT WILL BE DONE // ///////////////////////////// if (find_dates) Logger.printDetail("Getting Dates..."); if (find_times) Logger.printDetail("Getting Times..."); if (find_durations) Logger.printDetail("Getting Durations..."); if (find_sets) Logger.printDetail("Getting Sets..."); } /** * @see JCasAnnotator_ImplBase#process(JCas) */ public void process(JCas jcas) { // run preprocessing processors procMan.executeProcessors(jcas, Priority.PREPROCESSING); RuleManager rulem = RuleManager.getInstance(language); timexID = 1; // reset counter once per document processing timex_counter = 0; //////////////////////////////////////////// // CHECK SENTENCE BY SENTENCE FOR TIMEXES // //////////////////////////////////////////// FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator(); /* * check if the pipeline has annotated any sentences. if not, heideltime can't do any work, * will return from process() with a warning message. */ if(!sentIter.hasNext()) { Logger.printError(component, "HeidelTime has not found any sentence tokens in this document. " + "HeidelTime needs sentence tokens tagged by a preprocessing UIMA analysis engine to " + "do its work. 
Please check your UIMA workflow and add an analysis engine that creates " + "these sentence tokens."); } while (sentIter.hasNext()) { Sentence s = (Sentence) sentIter.next(); Boolean debugIteration = false; Boolean oldDebugState = Logger.getPrintDetails(); do { try { if (find_dates) { findTimexes("DATE", rulem.getHmDatePattern(), rulem.getHmDateOffset(), rulem.getHmDateNormalization(), rulem.getHmDateQuant(), s, jcas); } if (find_times) { findTimexes("TIME", rulem.getHmTimePattern(), rulem.getHmTimeOffset(), rulem.getHmTimeNormalization(), rulem.getHmTimeQuant(), s, jcas); } if (find_sets) { findTimexes("SET", rulem.getHmSetPattern(), rulem.getHmSetOffset(), rulem.getHmSetNormalization(), rulem.getHmSetQuant(), s, jcas); } if (find_durations) { findTimexes("DURATION", rulem.getHmDurationPattern(), rulem.getHmDurationOffset(), rulem.getHmDurationNormalization(), rulem.getHmDurationQuant(), s, jcas); } } catch(NullPointerException npe) { if(!debugIteration) { debugIteration = true; Logger.setPrintDetails(true); Logger.printError(component, "HeidelTime's execution has been interrupted by an exception that " + "is likely rooted in faulty normalization resource files. Please consider opening an issue " + "report containing the following information at our Google Code project issue tracker: " + "https://code.google.com/p/heideltime. Thanks!"); npe.printStackTrace(); Logger.printError(component, "Sentence [" + s.getBegin() + "-" + s.getEnd() + "]: " + s.getCoveredText()); Logger.printError(component, "Language: " + language); Logger.printError(component, "Re-running this sentence with DEBUGGING enabled..."); } else { debugIteration = false; Logger.setPrintDetails(oldDebugState); Logger.printError(component, "Execution will now resume."); } } } while(debugIteration); } /* * get longest Timex expressions only (if needed) */ if (deleteOverlapped == true) // could be modified to: get longest TIMEX expressions of one type, only ??? 
deleteOverlappedTimexes(jcas); /* * specify ambiguous values, e.g.: specific year for date values of * format UNDEF-year-01-01; specific month for values of format UNDEF-last-month */ specifyAmbiguousValues(jcas); // run arbitrary processors procMan.executeProcessors(jcas, Priority.ARBITRARY); // remove invalid timexes removeInvalids(jcas); // run postprocessing processors procMan.executeProcessors(jcas, Priority.POSTPROCESSING); timex_counter_global = timex_counter_global + timex_counter; Logger.printDetail(component, "Number of Timexes added to CAS: "+timex_counter + "(global: "+timex_counter_global+")"); } /** * Add timex annotation to CAS object. * * @param timexType * @param begin * @param end * @param timexValue * @param timexId * @param foundByRule * @param jcas */ public void addTimexAnnotation(String timexType, int begin, int end, Sentence sentence, String timexValue, String timexQuant, String timexFreq, String timexMod, String timexId, String foundByRule, JCas jcas) { Timex3 annotation = new Timex3(jcas); annotation.setBegin(begin); annotation.setEnd(end); annotation.setFilename(sentence.getFilename()); annotation.setSentId(sentence.getSentenceId()); FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(sentence); String allTokIds = ""; while (iterToken.hasNext()) { Token tok = (Token) iterToken.next(); if (tok.getBegin() == begin) { annotation.setFirstTokId(tok.getTokenId()); allTokIds = "BEGIN<-->" + tok.getTokenId(); } if ((tok.getBegin() > begin) && (tok.getEnd() <= end)) { allTokIds = allTokIds + "<-->" + tok.getTokenId(); } } annotation.setAllTokIds(allTokIds); annotation.setTimexType(timexType); annotation.setTimexValue(timexValue); annotation.setTimexId(timexId); annotation.setFoundByRule(foundByRule); if ((timexType.equals("DATE")) || (timexType.equals("TIME"))) { if ((timexValue.startsWith("X")) || (timexValue.startsWith("UNDEF"))) { annotation.setFoundByRule(foundByRule+"-relative"); } else { 
annotation.setFoundByRule(foundByRule+"-explicit"); } } if (!(timexQuant == null)) { annotation.setTimexQuant(timexQuant); } if (!(timexFreq == null)) { annotation.setTimexFreq(timexFreq); } if (!(timexMod == null)) { annotation.setTimexMod(timexMod); } annotation.addToIndexes(); this.timex_counter++; Logger.printDetail(annotation.getTimexId()+"EXTRACTION PHASE: "+" found by:"+annotation.getFoundByRule()+" text:"+annotation.getCoveredText()); Logger.printDetail(annotation.getTimexId()+"NORMALIZATION PHASE:"+" found by:"+annotation.getFoundByRule()+" text:"+annotation.getCoveredText()+" value:"+annotation.getTimexValue()); } /** * Postprocessing: Remove invalid timex expressions. These are already * marked as invalid: timexValue().equals("REMOVE") * * @param jcas */ public void removeInvalids(JCas jcas) { /* * Iterate over timexes and add invalids to HashSet * (invalids cannot be removed directly since iterator is used) */ FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); HashSet<Timex3> hsTimexToRemove = new HashSet<Timex3>(); while (iterTimex.hasNext()) { Timex3 timex = (Timex3) iterTimex.next(); if (timex.getTimexValue().equals("REMOVE")) { hsTimexToRemove.add(timex); } } // remove invalids, finally for (Timex3 timex3 : hsTimexToRemove) { timex3.removeFromIndexes(); this.timex_counter--; Logger.printDetail(timex3.getTimexId()+" REMOVING PHASE: "+"found by:"+timex3.getFoundByRule()+" text:"+timex3.getCoveredText()+" value:"+timex3.getTimexValue()); } } /** * Under-specified values are disambiguated here. Only Timexes of types "date" and "time" can be under-specified. 
* @param jcas */ @SuppressWarnings("unused") public void specifyAmbiguousValues(JCas jcas) { NormalizationManager norm = NormalizationManager.getInstance(language); // build up a list with all found TIMEX expressions List<Timex3> linearDates = new ArrayList<Timex3>(); FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); // Create List of all Timexes of types "date" and "time" while (iterTimex.hasNext()) { Timex3 timex = (Timex3) iterTimex.next(); if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) { linearDates.add(timex); } } //////////////////////////////////////// // IS THERE A DOCUMENT CREATION TIME? // //////////////////////////////////////// boolean dctAvailable = false; ////////////////////////////// // DOCUMENT TYPE TO PROCESS // //////////////////////////// boolean documentTypeNews = false; boolean documentTypeNarrative = false; boolean documentTypeColloquial = false; boolean documentTypeScientific = false; if (typeToProcess.equals("news")) { documentTypeNews = true; } if (typeToProcess.equals("narrative")) { documentTypeNarrative = true; } if (typeToProcess.equals("colloquial")) { documentTypeColloquial = true; } if (typeToProcess.equals("scientific")) { documentTypeScientific = true; } // get the dct information String dctValue = ""; int dctCentury = 0; int dctYear = 0; int dctDecade = 0; int dctMonth = 0; int dctDay = 0; String dctSeason = ""; String dctQuarter = ""; String dctHalf = ""; int dctWeekday = 0; int dctWeek = 0; ////////////////////////////////////////////// // INFORMATION ABOUT DOCUMENT CREATION TIME // ////////////////////////////////////////////// FSIterator dctIter = jcas.getAnnotationIndex(Dct.type).iterator(); if (dctIter.hasNext()) { dctAvailable = true; Dct dct = (Dct) dctIter.next(); dctValue = dct.getValue(); // year, month, day as mentioned in the DCT if (dctValue.matches("\\d\\d\\d\\d\\d\\d\\d\\d")) { dctCentury = Integer.parseInt(dctValue.substring(0, 2)); dctYear = 
Integer.parseInt(dctValue.substring(0, 4)); dctDecade = Integer.parseInt(dctValue.substring(2, 3)); dctMonth = Integer.parseInt(dctValue.substring(4, 6)); dctDay = Integer.parseInt(dctValue.substring(6, 8)); Logger.printDetail("dctCentury:"+dctCentury); Logger.printDetail("dctYear:"+dctYear); Logger.printDetail("dctDecade:"+dctDecade); Logger.printDetail("dctMonth:"+dctMonth); Logger.printDetail("dctDay:"+dctDay); } else { dctCentury = Integer.parseInt(dctValue.substring(0, 2)); dctYear = Integer.parseInt(dctValue.substring(0, 4)); dctDecade = Integer.parseInt(dctValue.substring(2, 3)); dctMonth = Integer.parseInt(dctValue.substring(5, 7)); dctDay = Integer.parseInt(dctValue.substring(8, 10)); Logger.printDetail("dctCentury:"+dctCentury); Logger.printDetail("dctYear:"+dctYear); Logger.printDetail("dctDecade:"+dctDecade); Logger.printDetail("dctMonth:"+dctMonth); Logger.printDetail("dctDay:"+dctDay); } dctQuarter = "Q"+norm.getFromNormMonthInQuarter(norm.getFromNormNumber(dctMonth+"")); dctHalf = "H1"; if (dctMonth > 6) { dctHalf = "H2"; } // season, week, weekday, have to be calculated dctSeason = norm.getFromNormMonthInSeason(norm.getFromNormNumber(dctMonth+"")+""); dctWeekday = DateCalculator.getWeekdayOfDate(dctYear+"-"+norm.getFromNormNumber(dctMonth+"")+"-"+norm.getFromNormNumber(dctDay+"")); dctWeek = DateCalculator.getWeekOfDate(dctYear+"-"+norm.getFromNormNumber(dctMonth+"") +"-"+norm.getFromNormNumber(dctDay+"")); Logger.printDetail("dctQuarter:"+dctQuarter); Logger.printDetail("dctSeason:"+dctSeason); Logger.printDetail("dctWeekday:"+dctWeekday); Logger.printDetail("dctWeek:"+dctWeek); } else { Logger.printDetail("No DCT available..."); } ////////////////////////////////////////////// // go through list of Date and Time timexes // ////////////////////////////////////////////// for (int i = 0; i < linearDates.size(); i++) { Timex3 t_i = (Timex3) linearDates.get(i); String value_i = t_i.getTimexValue(); // check if value_i has month, day, season, week 
(otherwise no UNDEF-year is possible) Boolean viHasMonth = false; Boolean viHasDay = false; Boolean viHasSeason = false; Boolean viHasWeek = false; Boolean viHasQuarter = false; Boolean viHasHalf = false; int viThisMonth = 0; int viThisDay = 0; String viThisSeason = ""; String viThisQuarter = ""; String viThisHalf = ""; String[] valueParts = value_i.split("-"); // check if UNDEF-year or UNDEF-century if ((value_i.startsWith("UNDEF-year")) || (value_i.startsWith("UNDEF-century"))) { if (valueParts.length > 2) { // get vi month if (valueParts[2].matches("\\d\\d")) { viHasMonth = true; viThisMonth = Integer.parseInt(valueParts[2]); } // get vi season else if ((valueParts[2].equals("SP")) || (valueParts[2].equals("SU")) || (valueParts[2].equals("FA")) || (valueParts[2].equals("WI"))) { viHasSeason = true; viThisSeason = valueParts[2]; } // get v1 quarter else if ((valueParts[2].equals("Q1")) || (valueParts[2].equals("Q2")) || (valueParts[2].equals("Q3")) || (valueParts[2].equals("Q4"))) { viHasQuarter = true; viThisQuarter = valueParts[2]; } else if ((valueParts[2].equals("H1")) || (valueParts[2].equals("H2"))) { viHasHalf = true; viThisHalf = valueParts[2]; } // get vi day if ((valueParts.length > 3) && (valueParts[3].matches("\\d\\d"))) { viHasDay = true; viThisDay = Integer.parseInt(valueParts[3]); } } } else { if (valueParts.length > 1) { // get vi month if (valueParts[1].matches("\\d\\d")) { viHasMonth = true; viThisMonth = Integer.parseInt(valueParts[1]); } // get vi season else if ((valueParts[1].equals("SP")) || (valueParts[1].equals("SU")) || (valueParts[1].equals("FA")) || (valueParts[1].equals("WI"))) { viHasSeason = true; viThisSeason = valueParts[1]; } // get vi day if ((valueParts.length > 2) && (valueParts[2].matches("\\d\\d"))) { viHasDay = true; viThisDay = Integer.parseInt(valueParts[2]); } } } // get the last tense (depending on the part of speech tags used in front or behind the expression) String last_used_tense = ContextAnalyzer.getLastTense(t_i, 
jcas, language); ////////////////////////// // DISAMBIGUATION PHASE // ////////////////////////// //////////////////////////////////////////////////// // IF YEAR IS COMPLETELY UNSPECIFIED (UNDEF-year) // //////////////////////////////////////////////////// String valueNew = value_i; if (value_i.startsWith("UNDEF-year")) { String newYearValue = dctYear+""; // vi has month (ignore day) if (viHasMonth == true && (viHasSeason == false)) { // WITH DOCUMENT CREATION TIME if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // Tense is FUTURE if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { // if dct-month is larger than vi-month, than add 1 to dct-year if (dctMonth > viThisMonth) { int intNewYear = dctYear + 1; newYearValue = intNewYear + ""; } } // Tense is PAST if ((last_used_tense.equals("PAST"))) { // if dct-month is smaller than vi month, than substrate 1 from dct-year if (dctMonth < viThisMonth) { int intNewYear = dctYear - 1; newYearValue = intNewYear + ""; } } } // WITHOUT DOCUMENT CREATION TIME else { newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); } } // vi has quaurter if (viHasQuarter == true) { // WITH DOCUMENT CREATION TIME if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // Tense is FUTURE if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))) { int intNewYear = dctYear + 1; newYearValue = intNewYear + ""; } } // Tense is PAST if ((last_used_tense.equals("PAST"))) { if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))) { int intNewYear = dctYear - 1; newYearValue = intNewYear + ""; } } // IF NO TENSE IS FOUND if (last_used_tense.equals("")){ if (documentTypeColloquial){ // IN COLLOQUIAL: future temporal expressions if 
(Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))){ int intNewYear = dctYear + 1; newYearValue = intNewYear + ""; } } else{ // IN NEWS: past temporal expressions if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))){ int intNewYear = dctYear - 1; newYearValue = intNewYear + ""; } } } } // WITHOUT DOCUMENT CREATION TIME else { newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); } } // vi has half if (viHasHalf == true) { // WITH DOCUMENT CREATION TIME if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // Tense is FUTURE if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))) { int intNewYear = dctYear + 1; newYearValue = intNewYear + ""; } } // Tense is PAST if ((last_used_tense.equals("PAST"))) { if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))) { int intNewYear = dctYear - 1; newYearValue = intNewYear + ""; } } // IF NO TENSE IS FOUND if (last_used_tense.equals("")){ if (documentTypeColloquial){ // IN COLLOQUIAL: future temporal expressions if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))){ int intNewYear = dctYear + 1; newYearValue = intNewYear + ""; } } else{ // IN NEWS: past temporal expressions if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))){ int intNewYear = dctYear - 1; newYearValue = intNewYear + ""; } } } } // WITHOUT DOCUMENT CREATION TIME else { newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); } } // vi has season if ((viHasMonth == false) && (viHasDay == false) && (viHasSeason == true)) { // TODO check tenses? 
// WITH DOCUMENT CREATION TIME if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { newYearValue = dctYear+""; } // WITHOUT DOCUMENT CREATION TIME else { newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); } } // vi has week if (viHasWeek) { // WITH DOCUMENT CREATION TIME if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { newYearValue = dctYear+""; } // WITHOUT DOCUMENT CREATION TIME else { newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); } } // REPLACE THE UNDEF-YEAR WITH THE NEWLY CALCULATED YEAR AND ADD TIMEX TO INDEXES if (newYearValue.equals("")) { valueNew = value_i.replaceFirst("UNDEF-year", "XXXX"); } else { valueNew = value_i.replaceFirst("UNDEF-year", newYearValue); } } /////////////////////////////////////////////////// // just century is unspecified (UNDEF-century86) // /////////////////////////////////////////////////// else if ((value_i.startsWith("UNDEF-century"))) { String newCenturyValue = dctCentury+""; // NEWS and COLLOQUIAL DOCUMENTS if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && !value_i.equals("UNDEF-century")) { int viThisDecade = Integer.parseInt(value_i.substring(13, 14)); Logger.printDetail("dctCentury"+dctCentury); newCenturyValue = dctCentury+""; Logger.printDetail("dctCentury"+dctCentury); // Tense is FUTURE if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { if (viThisDecade < dctDecade) { newCenturyValue = dctCentury + 1+""; } else { newCenturyValue = dctCentury+""; } } // Tense is PAST if ((last_used_tense.equals("PAST"))) { if (dctDecade < viThisDecade) { newCenturyValue = dctCentury - 1+""; } else { newCenturyValue = dctCentury+""; } } } // NARRATIVE DOCUMENTS else { newCenturyValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "century", language); } if (newCenturyValue.equals("")) { // always assume 
that sixties, twenties, and so on are 19XX (changed 2011-09-08) valueNew = value_i.replaceFirst("UNDEF-century", "19"); } else { valueNew = value_i.replaceFirst("UNDEF-century", newCenturyValue+""); } // always assume that sixties, twenties, and so on are 19XX (changed 2011-09-08) if (valueNew.matches("\\d\\d\\d")) { valueNew = "19" + valueNew.substring(2); } } //////////////////////////////////////////////////// // CHECK IMPLICIT EXPRESSIONS STARTING WITH UNDEF // //////////////////////////////////////////////////// else if (value_i.startsWith("UNDEF")) { valueNew = value_i; ////////////////// // TO CALCULATE // ////////////////// // year to calculate if (value_i.matches("^UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+).*")) { for (MatchResult mr : Toolbox.findMatches(Pattern.compile("^(UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+)).*"), value_i)) { String checkUndef = mr.group(1); String ltn = mr.group(2); String unit = mr.group(3); String op = mr.group(4); int diff = Integer.parseInt(mr.group(5)); // do the processing for SCIENTIFIC documents (TPZ identification could be improved) if ((documentTypeScientific)){ String opSymbol = "-"; if (op.equals("PLUS")){ opSymbol = "+"; } if (unit.equals("year")){ String diffString = diff+""; if (diff < 10){ diffString = "000"+diff; } else if (diff < 100){ diffString = "00"+diff; } else if (diff < 1000){ diffString = "0"+diff; } valueNew = "TPZ"+opSymbol+diffString; } else if (unit.equals("month")){ String diffString = diff+""; if (diff < 10){ diffString = "0000-0"+diff; } else { diffString = "0000-"+diff; } valueNew = "TPZ"+opSymbol+diffString; } else if (unit.equals("week")){ String diffString = diff+""; if (diff < 10){ diffString = "0000-W0"+diff; } else { diffString = "0000-W"+diff; } valueNew = "TPZ"+opSymbol+diffString; } else if (unit.equals("day")){ String diffString = diff+""; if (diff < 10){ diffString = "0000-00-0"+diff; } else { diffString = "0000-00-"+diff; } valueNew = "TPZ"+opSymbol+diffString; } 
else if (unit.equals("hour")){ String diffString = diff+""; if (diff < 10){ diffString = "0000-00-00T0"+diff; } else { diffString = "0000-00-00T"+diff; } valueNew = "TPZ"+opSymbol+diffString; } else if (unit.equals("minute")){ String diffString = diff+""; if (diff < 10){ diffString = "0000-00-00T00:0"+diff; } else { diffString = "0000-00-00T00:"+diff; } valueNew = "TPZ"+opSymbol+diffString; } else if (unit.equals("second")){ String diffString = diff+""; if (diff < 10){ diffString = "0000-00-00T00:00:0"+diff; } else { diffString = "0000-00-00T00:00:"+diff; } valueNew = "TPZ"+opSymbol+diffString; } } else{ // check for REFUNIT (only allowed for "year") if ((ltn.equals("REFUNIT")) && (unit.equals("year"))) { String dateWithYear = ContextAnalyzer.getLastMentionedX(linearDates, i, "dateYear", language); if (dateWithYear.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { if (op.equals("MINUS")) { diff = diff * (-1); } int yearNew = Integer.parseInt(dateWithYear.substring(0,4)) + diff; String rest = dateWithYear.substring(4); valueNew = valueNew.replace(checkUndef, yearNew+rest); } } // REF and this are handled here if (unit.equals("century")) { if ((documentTypeNews|documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { int century = dctCentury; if (op.equals("MINUS")) { century = dctCentury - diff; } else if (op.equals("PLUS")) { century = dctCentury + diff; } valueNew = valueNew.replace(checkUndef, century+""); } else { String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates, i, "century", language); if (lmCentury.equals("")) { valueNew = valueNew.replace(checkUndef, ""); } else { if (op.equals("MINUS")) { lmCentury = Integer.parseInt(lmCentury) - diff + ""; } else if (op.equals("PLUS")) { lmCentury = Integer.parseInt(lmCentury) + diff + ""; } valueNew = valueNew.replace(checkUndef, lmCentury); } } } else if (unit.equals("decade")) { if 
((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { int dctDecadeLong = Integer.parseInt(dctCentury + "" + dctDecade); int decade = dctDecadeLong; if (op.equals("MINUS")) { decade = dctDecadeLong - diff; } else if (op.equals("PLUS")) { decade = dctDecadeLong + diff; } valueNew = valueNew.replace(checkUndef, decade+"X"); } else { String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates, i, "decade", language); if (lmDecade.equals("")) { valueNew = valueNew.replace(checkUndef, "XXX"); } else { if (op.equals("MINUS")) { lmDecade = Integer.parseInt(lmDecade) - diff + ""; } else if (op.equals("PLUS")) { lmDecade = Integer.parseInt(lmDecade) + diff + ""; } valueNew = valueNew.replace(checkUndef, lmDecade); } } } else if (unit.equals("year")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { int intValue = dctYear; if (op.equals("MINUS")) { intValue = dctYear - diff; } else if (op.equals("PLUS")) { intValue = dctYear + diff; } valueNew = valueNew.replace(checkUndef, intValue + ""); } else { String lmYear = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); if (lmYear.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { int intValue = Integer.parseInt(lmYear); if (op.equals("MINUS")) { intValue = Integer.parseInt(lmYear) - diff; } else if (op.equals("PLUS")) { intValue = Integer.parseInt(lmYear) + diff; } valueNew = valueNew.replace(checkUndef, intValue+""); } } } else if (unit.equals("quarter")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { int intYear = dctYear; int intQuarter = Integer.parseInt(dctQuarter.substring(1)); int diffQuarters = diff % 4; diff = diff - diffQuarters; int diffYears = diff / 4; if (op.equals("MINUS")) { diffQuarters = diffQuarters * (-1); diffYears = diffYears * (-1); } intYear = intYear + diffYears; 
intQuarter = intQuarter + diffQuarters; valueNew = valueNew.replace(checkUndef, intYear+"-Q"+intQuarter); } else { String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); if (lmQuarter.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { int intYear = Integer.parseInt(lmQuarter.substring(0, 4)); int intQuarter = Integer.parseInt(lmQuarter.substring(6)); int diffQuarters = diff % 4; diff = diff - diffQuarters; int diffYears = diff / 4; if (op.equals("MINUS")) { diffQuarters = diffQuarters * (-1); diffYears = diffYears * (-1); } intYear = intYear + diffYears; intQuarter = intQuarter + diffQuarters; valueNew = valueNew.replace(checkUndef, intYear+"-Q"+intQuarter); } } } else if (unit.equals("month")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { if (op.equals("MINUS")) { diff = diff * (-1); } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), diff)); } else { String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month", language); if (lmMonth.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { if (op.equals("MINUS")) { diff = diff * (-1); } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, diff)); } } } else if (unit.equals("week")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { if (op.equals("MINUS")) { diff = diff * (-1); } else if (op.equals("PLUS")) { // diff = diff * 7; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""), diff, language)); } else { String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { if (op.equals("MINUS")) { diff = diff * 7 * (-1); } 
else if (op.equals("PLUS")) { diff = diff * 7; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); } } } else if (unit.equals("day")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { if (op.equals("MINUS")) { diff = diff * (-1); } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-" + dctDay, diff)); } else { String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { if (op.equals("MINUS")) { diff = diff * (-1); } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); } } } } } } // century else if (value_i.startsWith("UNDEF-last-century")) { String checkUndef = "UNDEF-last-century"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury - 1 +"")); } else { String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language); if (lmCentury.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(Integer.parseInt(lmCentury) - 1 +"")); } } } else if (value_i.startsWith("UNDEF-this-century")) { String checkUndef = "UNDEF-this-century"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury+"")); } else { String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language); if (lmCentury.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(Integer.parseInt(lmCentury)+"")); } } } else if (value_i.startsWith("UNDEF-next-century")) { String checkUndef = 
"UNDEF-next-century"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury + 1+"")); } else { String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language); if (lmCentury.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(Integer.parseInt(lmCentury) + 1+"")); } } } // decade else if (value_i.startsWith("UNDEF-last-decade")) { String checkUndef = "UNDEF-last-decade"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, (dctYear - 10+"").substring(0,3)); } else { String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language); if (lmDecade.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmDecade)-1+""); } } } else if (value_i.startsWith("UNDEF-this-decade")) { String checkUndef = "UNDEF-this-decade"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, (dctYear+"").substring(0,3)); } else { String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language); if (lmDecade.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, lmDecade); } } } else if (value_i.startsWith("UNDEF-next-decade")) { String checkUndef = "UNDEF-next-decade"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, (dctYear + 10+"").substring(0,3)); } else { String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language); if (lmDecade.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, 
Integer.parseInt(lmDecade)+1+""); } } } // year else if (value_i.startsWith("UNDEF-last-year")) { String checkUndef = "UNDEF-last-year"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, dctYear -1 +""); } else { String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language); if (lmYear.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmYear)-1+""); } } } else if (value_i.startsWith("UNDEF-this-year")) { String checkUndef = "UNDEF-this-year"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, dctYear +""); } else { String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language); if (lmYear.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, lmYear); } } } else if (value_i.startsWith("UNDEF-next-year")) { String checkUndef = "UNDEF-next-year"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, dctYear +1 +""); } else { String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language); if (lmYear.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX"); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmYear)+1+""); } } } // month else if (value_i.startsWith("UNDEF-last-month")) { String checkUndef = "UNDEF-last-month"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), -1)); } else { String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language); if (lmMonth.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { 
valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, -1)); } } } else if (value_i.startsWith("UNDEF-this-month")) { String checkUndef = "UNDEF-this-month"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, dctYear + "-" + norm.getFromNormNumber(dctMonth+"")); } else { String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language); if (lmMonth.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { valueNew = valueNew.replace(checkUndef, lmMonth); } } } else if (value_i.startsWith("UNDEF-next-month")) { String checkUndef = "UNDEF-next-month"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), 1)); } else { String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language); if (lmMonth.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, 1)); } } } // day else if (value_i.startsWith("UNDEF-last-day")) { String checkUndef = "UNDEF-last-day"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ dctDay, -1)); } else { String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay,-1)); } } } else if (value_i.startsWith("UNDEF-this-day")) { String checkUndef = "UNDEF-this-day"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, dctYear 
+ "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ norm.getFromNormNumber(dctDay+"")); } else { String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { valueNew = valueNew.replace(checkUndef, lmDay); } if (value_i.equals("UNDEF-this-day")) { valueNew = "PRESENT_REF"; } } } else if (value_i.startsWith("UNDEF-next-day")) { String checkUndef = "UNDEF-next-day"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ dctDay, 1)); } else { String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay,1)); } } } // week else if (value_i.startsWith("UNDEF-last-week")) { String checkUndef = "UNDEF-last-week"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""),-1, language)); } else { String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language); if (lmWeek.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-WXX"); } else { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(lmWeek,-1, language)); } } } else if (value_i.startsWith("UNDEF-this-week")) { String checkUndef = "UNDEF-this-week"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef,dctYear+"-W"+norm.getFromNormNumber(dctWeek+"")); } else { String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language); if (lmWeek.equals("")) { valueNew = 
valueNew.replace(checkUndef,"XXXX-WXX"); } else { valueNew = valueNew.replace(checkUndef,lmWeek); } } } else if (value_i.startsWith("UNDEF-next-week")) { String checkUndef = "UNDEF-next-week"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""),1, language)); } else { String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language); if (lmWeek.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-WXX"); } else { valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(lmWeek,1, language)); } } } // quarter else if (value_i.startsWith("UNDEF-last-quarter")) { String checkUndef = "UNDEF-last-quarter"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { if (dctQuarter.equals("Q1")) { valueNew = valueNew.replace(checkUndef, dctYear-1+"-Q4"); } else { int newQuarter = Integer.parseInt(dctQuarter.substring(1,2))-1; valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter); } } else { String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); if (lmQuarter.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-QX"); } else { int lmQuarterOnly = Integer.parseInt(lmQuarter.substring(6,7)); int lmYearOnly = Integer.parseInt(lmQuarter.substring(0,4)); if (lmQuarterOnly == 1) { valueNew = valueNew.replace(checkUndef, lmYearOnly-1+"-Q4"); } else { int newQuarter = lmQuarterOnly-1; valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter); } } } } else if (value_i.startsWith("UNDEF-this-quarter")) { String checkUndef = "UNDEF-this-quarter"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, dctYear+"-"+dctQuarter); } else { String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); if 
(lmQuarter.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-QX"); } else { valueNew = valueNew.replace(checkUndef, lmQuarter); } } } else if (value_i.startsWith("UNDEF-next-quarter")) { String checkUndef = "UNDEF-next-quarter"; if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { if (dctQuarter.equals("Q4")) { valueNew = valueNew.replace(checkUndef, dctYear+1+"-Q1"); } else { int newQuarter = Integer.parseInt(dctQuarter.substring(1,2))+1; valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter); } } else { String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); if (lmQuarter.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-QX"); } else { int lmQuarterOnly = Integer.parseInt(lmQuarter.substring(6,7)); int lmYearOnly = Integer.parseInt(lmQuarter.substring(0,4)); if (lmQuarterOnly == 4) { valueNew = valueNew.replace(checkUndef, lmYearOnly+1+"-Q1"); } else { int newQuarter = lmQuarterOnly+1; valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter); } } } } // MONTH NAMES else if (value_i.matches("UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december).*")) { for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december))(.*)"),value_i)) { String rest = mr.group(4); int day = 0; for (MatchResult mr_rest : Toolbox.findMatches(Pattern.compile("-([0-9][0-9])"),rest)){ day = Integer.parseInt(mr_rest.group(1)); } String checkUndef = mr.group(1); String ltn = mr.group(2); String newMonth = norm.getFromNormMonthName((mr.group(3))); int newMonthInt = Integer.parseInt(newMonth); if (ltn.equals("last")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // check day if dct-month and newMonth are equal if ((dctMonth == newMonthInt) && (!(day == 0))){ if (dctDay > day){ 
valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); } else{ valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newMonth); } } else if (dctMonth <= newMonthInt) { valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newMonth); } else { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); } } else { String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language); if (lmMonth.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { int lmMonthInt = Integer.parseInt(lmMonth.substring(5,7)); // int lmDayInt = 0; if ((lmMonth.length() > 9) && (lmMonth.subSequence(8,10).toString().matches("\\d\\d"))){ lmDayInt = Integer.parseInt(lmMonth.subSequence(8,10)+""); } if ((lmMonthInt == newMonthInt) && (!(lmDayInt == 0)) && (!(day == 0))){ if (lmDayInt > day){ valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); } else{ valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))-1+"-"+newMonth); } } if (lmMonthInt <= newMonthInt) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))-1+"-"+newMonth); } else { valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); } } } } else if (ltn.equals("this")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); } else { String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language); if (lmMonth.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); } } } else if (ltn.equals("next")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // check day if dct-month and newMonth are equal if ((dctMonth == newMonthInt) && (!(day == 0))){ if (dctDay < day){ valueNew = valueNew.replace(checkUndef, 
dctYear+"-"+newMonth); } else{ valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newMonth); } } else if (dctMonth >= newMonthInt) { valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newMonth); } else { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); } } else { String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language); if (lmMonth.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { int lmMonthInt = Integer.parseInt(lmMonth.substring(5,7)); if (lmMonthInt >= newMonthInt) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))+1+"-"+newMonth); } else { valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); } } } } } } // SEASONS NAMES else if (value_i.matches("^UNDEF-(last|this|next)-(SP|SU|FA|WI).*")) { for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next)-(SP|SU|FA|WI)).*"),value_i)) { String checkUndef = mr.group(1); String ltn = mr.group(2); String newSeason = mr.group(3); if (ltn.equals("last")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { if (dctSeason.equals("SP")) { valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); } else if (dctSeason.equals("SU")) { if (newSeason.equals("SP")) { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); } } else if (dctSeason.equals("FA")) { if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); } } else if (dctSeason.equals("WI")) { if (newSeason.equals("WI")) { valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); } else { if (dctMonth < 12){ valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); } else{ valueNew = valueNew.replace(checkUndef, 
dctYear+"-"+newSeason); } } } } else { // NARRATVIE DOCUMENT String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language); if (lmSeason.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { if (lmSeason.substring(5,7).equals("SP")) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); } else if (lmSeason.substring(5,7).equals("SU")) { if (lmSeason.substring(5,7).equals("SP")) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); } } else if (lmSeason.substring(5,7).equals("FA")) { if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); } } else if (lmSeason.substring(5,7).equals("WI")) { if (newSeason.equals("WI")) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); } } } } } else if (ltn.equals("this")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // TODO include tense of sentence? valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); } else { // TODO include tense of sentence? 
String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language); if (lmSeason.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { valueNew = valueNew.replace(checkUndef, lmSeason.substring(0,4)+"-"+newSeason); } } } else if (ltn.equals("next")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { if (dctSeason.equals("SP")) { if (newSeason.equals("SP")) { valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); } } else if (dctSeason.equals("SU")) { if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); } } else if (dctSeason.equals("FA")) { if (newSeason.equals("WI")) { valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); } } else if (dctSeason.equals("WI")) { valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); } } else { // NARRATIVE DOCUMENT String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language); if (lmSeason.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX"); } else { if (lmSeason.substring(5,7).equals("SP")) { if (newSeason.equals("SP")) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); } } else if (lmSeason.substring(5,7).equals("SU")) { if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); } } else if (lmSeason.substring(5,7).equals("FA")) { 
if (newSeason.equals("WI")) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); } else { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); } } else if (lmSeason.substring(5,7).equals("WI")) { valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); } } } } } } // WEEKDAY NAMES // TODO the calculation is strange, but works // TODO tense should be included?! else if (value_i.matches("^UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday).*")) { for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday)).*"),value_i)) { String checkUndef = mr.group(1); String ltnd = mr.group(2); String newWeekday = mr.group(3); int newWeekdayInt = Integer.parseInt(norm.getFromNormDayInWeek(newWeekday)); if (ltnd.equals("last")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { int diff = (-1) * (dctWeekday - newWeekdayInt); if (diff >= 0) { diff = diff - 7; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-" + dctDay, diff)); } else { String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); int diff = (-1) * (lmWeekdayInt - newWeekdayInt); if (diff >= 0) { diff = diff - 7; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); } } } else if (ltnd.equals("this")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // TODO tense should be included?! 
int diff = (-1) * (dctWeekday - newWeekdayInt); if (diff >= 0) { diff = diff - 7; } if (diff == -7) { diff = 0; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff)); } else { // TODO tense should be included?! String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); int diff = (-1) * (lmWeekdayInt - newWeekdayInt); if (diff >= 0) { diff = diff - 7; } if (diff == -7) { diff = 0; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); } } } else if (ltnd.equals("next")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { int diff = newWeekdayInt - dctWeekday; if (diff <= 0) { diff = diff + 7; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff)); } else { String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); int diff = newWeekdayInt - lmWeekdayInt; if (diff <= 0) { diff = diff + 7; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); } } } else if (ltnd.equals("day")) { if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { // TODO tense should be included?! 
int diff = (-1) * (dctWeekday - newWeekdayInt); if (diff >= 0) { diff = diff - 7; } if (diff == -7) { diff = 0; } // Tense is FUTURE if ((last_used_tense.equals("FUTURE")) && diff != 0) { diff = diff + 7; } // Tense is PAST if ((last_used_tense.equals("PAST"))) { } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff)); } else { // TODO tense should be included?! String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); if (lmDay.equals("")) { valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); } else { int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); int diff = (-1) * (lmWeekdayInt - newWeekdayInt); if (diff >= 0) { diff = diff - 7; } if (diff == -7) { diff = 0; } valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); } } } } } else { Logger.printDetail(component, "ATTENTION: UNDEF value for: " + valueNew+" is not handled in disambiguation phase!"); } } t_i.removeFromIndexes(); Logger.printDetail(t_i.getTimexId()+" DISAMBIGUATION PHASE: foundBy:"+t_i.getFoundByRule()+" text:"+t_i.getCoveredText()+" value:"+t_i.getTimexValue()+" NEW value:"+valueNew); t_i.setTimexValue(valueNew); t_i.addToIndexes(); linearDates.set(i, t_i); } } /** * @param jcas */ public void deleteOverlappedTimexes(JCas jcas) { FSIterator timexIter1 = jcas.getAnnotationIndex(Timex3.type).iterator(); HashSet<Timex3> hsTimexesToRemove = new HashSet<Timex3>(); while (timexIter1.hasNext()) { Timex3 t1 = (Timex3) timexIter1.next(); FSIterator timexIter2 = jcas.getAnnotationIndex(Timex3.type) .iterator(); while (timexIter2.hasNext()) { Timex3 t2 = (Timex3) timexIter2.next(); if (((t1.getBegin() >= t2.getBegin()) && (t1.getEnd() < t2.getEnd())) || // t1 starts inside or with t2 and ends before t2 -> remove t1 ((t1.getBegin() > t2.getBegin()) && (t1.getEnd() <= t2.getEnd()))) { // t1 starts inside t2 and ends with or before t2 -> remove t1 hsTimexesToRemove.add(t1); } else if 
(((t2.getBegin() >= t1.getBegin()) && (t2.getEnd() < t1.getEnd())) || // t2 starts inside or with t1 and ends before t1 -> remove t2 ((t2.getBegin() > t1.getBegin()) && (t2.getEnd() <= t1.getEnd()))) { // t2 starts inside t1 and ends with or before t1 -> remove t2 hsTimexesToRemove.add(t2); } // identical length if ((t1.getBegin() == t2.getBegin()) && (t1.getEnd() == t2.getEnd())) { if ((t1.getTimexType().equals("SET")) || (t2.getTimexType().equals("SET"))) { // REMOVE REAL DUPLICATES (the one with the lower timexID) if ((Integer.parseInt(t1.getTimexId().substring(1)) < Integer.parseInt(t2.getTimexId().substring(1)))) { hsTimexesToRemove.add(t1); } } else { if (!(t1.equals(t2))) { if ((t1.getTimexValue().startsWith("UNDEF")) && (!(t2.getTimexValue().startsWith("UNDEF")))) { hsTimexesToRemove.add(t1); } else if ((!(t1.getTimexValue().startsWith("UNDEF"))) && (t2.getTimexValue().startsWith("UNDEF"))) { hsTimexesToRemove.add(t2); } // t1 is explicit, but t2 is not else if ((t1.getFoundByRule().endsWith("explicit")) && (!(t2.getFoundByRule().endsWith("explicit")))) { hsTimexesToRemove.add(t2); } // REMOVE REAL DUPLICATES (the one with the lower timexID) else if ((Integer.parseInt(t1.getTimexId().substring(1)) < Integer.parseInt(t2.getTimexId().substring(1)))) { hsTimexesToRemove.add(t1); } } } } } } // remove, finally for (Timex3 t : hsTimexesToRemove) { Logger.printDetail(t.getTimexId()+"REMOVE DUPLICATE: " + t.getCoveredText()+"(id:"+t.getTimexId()+" value:"+t.getTimexValue()+" found by:"+t.getFoundByRule()+")"); t.removeFromIndexes(); timex_counter--; } } /** * Identify the part of speech (POS) of a MarchResult. 
* @param tokBegin * @param tokEnd * @param s * @param jcas * @return */ public String getPosFromMatchResult(int tokBegin, int tokEnd, Sentence s, JCas jcas) { // get all tokens in sentence HashMap<Integer, Token> hmTokens = new HashMap<Integer, Token>(); FSIterator iterTok = jcas.getAnnotationIndex(Token.type).subiterator(s); while (iterTok.hasNext()) { Token token = (Token) iterTok.next(); hmTokens.put(token.getBegin(), token); } // get correct token String pos = ""; if (hmTokens.containsKey(tokBegin)) { Token tokenToCheck = hmTokens.get(tokBegin); pos = tokenToCheck.getPos(); } return pos; } /** * Apply the extraction rules, normalization rules * @param timexType * @param hmPattern * @param hmOffset * @param hmNormalization * @param hmQuant * @param s * @param jcas */ public void findTimexes(String timexType, HashMap<Pattern, String> hmPattern, HashMap<String, String> hmOffset, HashMap<String, String> hmNormalization, HashMap<String, String> hmQuant, Sentence s, JCas jcas) { RuleManager rm = RuleManager.getInstance(language); HashMap<String, String> hmDatePosConstraint = rm.getHmDatePosConstraint(); HashMap<String, String> hmDurationPosConstraint = rm.getHmDurationPosConstraint(); HashMap<String, String> hmTimePosConstraint = rm.getHmTimePosConstraint(); HashMap<String, String> hmSetPosConstraint = rm.getHmSetPosConstraint(); // Iterator over the rules by sorted by the name of the rules // this is important since later, the timexId will be used to // decide which of two expressions shall be removed if both // have the same offset for (Iterator<Pattern> i = Toolbox.sortByValue(hmPattern).iterator(); i.hasNext(); ) { Pattern p = (Pattern) i.next(); for (MatchResult r : Toolbox.findMatches(p, s.getCoveredText())) { boolean infrontBehindOK = ContextAnalyzer.checkTokenBoundaries(r, s, jcas) // improved token boundary checking && ContextAnalyzer.checkInfrontBehind(r, s); boolean posConstraintOK = true; // CHECK POS CONSTRAINTS if (timexType.equals("DATE")) { if 
(hmDatePosConstraint.containsKey(hmPattern.get(p))) { posConstraintOK = checkPosConstraint(s , hmDatePosConstraint.get(hmPattern.get(p)), r, jcas); } } else if (timexType.equals("DURATION")) { if (hmDurationPosConstraint.containsKey(hmPattern.get(p))) { posConstraintOK = checkPosConstraint(s , hmDurationPosConstraint.get(hmPattern.get(p)), r, jcas); } } else if (timexType.equals("TIME")) { if (hmTimePosConstraint.containsKey(hmPattern.get(p))) { posConstraintOK = checkPosConstraint(s , hmTimePosConstraint.get(hmPattern.get(p)), r, jcas); } } else if (timexType.equals("SET")) { if (hmSetPosConstraint.containsKey(hmPattern.get(p))) { posConstraintOK = checkPosConstraint(s , hmSetPosConstraint.get(hmPattern.get(p)), r, jcas); } } if ((infrontBehindOK == true) && (posConstraintOK == true)) { // Offset of timex expression (in the checked sentence) int timexStart = r.start(); int timexEnd = r.end(); // Normalization from Files: // Any offset parameter? if (hmOffset.containsKey(hmPattern.get(p))) { String offset = hmOffset.get(hmPattern.get(p)); // pattern for offset information Pattern paOffset = Pattern.compile("group\\(([0-9]+)\\)-group\\(([0-9]+)\\)"); for (MatchResult mr : Toolbox.findMatches(paOffset,offset)) { int startOffset = Integer.parseInt(mr.group(1)); int endOffset = Integer.parseInt(mr.group(2)); timexStart = r.start(startOffset); timexEnd = r.end(endOffset); } } // Normalization Parameter if (hmNormalization.containsKey(hmPattern.get(p))) { String[] attributes = new String[4]; if (timexType.equals("DATE")) { attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmDateNormalization(), rm.getHmDateQuant(), rm.getHmDateFreq(), rm.getHmDateMod(), r, jcas); } else if (timexType.equals("DURATION")) { attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmDurationNormalization(), rm.getHmDurationQuant(), rm.getHmDurationFreq(), rm.getHmDurationMod(), r, jcas); } else if (timexType.equals("TIME")) { attributes = 
getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmTimeNormalization(), rm.getHmTimeQuant(), rm.getHmTimeFreq(), rm.getHmTimeMod(), r, jcas); } else if (timexType.equals("SET")) { attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmSetNormalization(), rm.getHmSetQuant(), rm.getHmSetFreq(), rm.getHmSetMod(), r, jcas); } addTimexAnnotation(timexType, timexStart + s.getBegin(), timexEnd + s.getBegin(), s, attributes[0], attributes[1], attributes[2], attributes[3],"t" + timexID++, hmPattern.get(p), jcas); } else { Logger.printError("SOMETHING REALLY WRONG HERE: "+hmPattern.get(p)); } } } } } /** * Check whether the part of speech constraint defined in a rule is satisfied. * @param s * @param posConstraint * @param m * @param jcas * @return */ public boolean checkPosConstraint(Sentence s, String posConstraint, MatchResult m, JCas jcas) { Pattern paConstraint = Pattern.compile("group\\(([0-9]+)\\):(.*?):"); for (MatchResult mr : Toolbox.findMatches(paConstraint,posConstraint)) { int groupNumber = Integer.parseInt(mr.group(1)); int tokenBegin = s.getBegin() + m.start(groupNumber); int tokenEnd = s.getBegin() + m.end(groupNumber); String pos = mr.group(2); String pos_as_is = getPosFromMatchResult(tokenBegin, tokenEnd ,s, jcas); if (pos.equals(pos_as_is)) { Logger.printDetail("POS CONSTRAINT IS VALID: pos should be "+pos+" and is "+pos_as_is); } else { return false; } } return true; } public String applyRuleFunctions(String tonormalize, MatchResult m) { NormalizationManager norm = NormalizationManager.getInstance(language); String normalized = ""; // pattern for normalization functions + group information // pattern for group information Pattern paNorm = Pattern.compile("%([A-Za-z0-9]+?)\\(group\\(([0-9]+)\\)\\)"); Pattern paGroup = Pattern.compile("group\\(([0-9]+)\\)"); while ((tonormalize.contains("%")) || (tonormalize.contains("group"))) { // replace normalization functions for (MatchResult mr : Toolbox.findMatches(paNorm,tonormalize)) { 
Logger.printDetail("-----------------------------------"); Logger.printDetail("DEBUGGING: tonormalize:"+tonormalize); Logger.printDetail("DEBUGGING: mr.group():"+mr.group()); Logger.printDetail("DEBUGGING: mr.group(1):"+mr.group(1)); Logger.printDetail("DEBUGGING: mr.group(2):"+mr.group(2)); Logger.printDetail("DEBUGGING: m.group():"+m.group()); Logger.printDetail("DEBUGGING: m.group("+Integer.parseInt(mr.group(2))+"):"+m.group(Integer.parseInt(mr.group(2)))); Logger.printDetail("DEBUGGING: hmR...:"+norm.getFromHmAllNormalization(mr.group(1)).get(m.group(Integer.parseInt(mr.group(2))))); Logger.printDetail("-----------------------------------"); if (! (m.group(Integer.parseInt(mr.group(2))) == null)) { String partToReplace = m.group(Integer.parseInt(mr.group(2))).replaceAll("[\n\\s]+", " "); if (!(norm.getFromHmAllNormalization(mr.group(1)).containsKey(partToReplace))) { Logger.printDetail("Maybe problem with normalization of the resource: "+mr.group(1)); Logger.printDetail("Maybe problem with part to replace? 
"+partToReplace); } tonormalize = tonormalize.replace(mr.group(), norm.getFromHmAllNormalization(mr.group(1)).get(partToReplace)); } else { Logger.printDetail("Empty part to normalize in "+mr.group(1)); tonormalize = tonormalize.replace(mr.group(), ""); } } // replace other groups for (MatchResult mr : Toolbox.findMatches(paGroup,tonormalize)) { Logger.printDetail("-----------------------------------"); Logger.printDetail("DEBUGGING: tonormalize:"+tonormalize); Logger.printDetail("DEBUGGING: mr.group():"+mr.group()); Logger.printDetail("DEBUGGING: mr.group(1):"+mr.group(1)); Logger.printDetail("DEBUGGING: m.group():"+m.group()); Logger.printDetail("DEBUGGING: m.group("+Integer.parseInt(mr.group(1))+"):"+m.group(Integer.parseInt(mr.group(1)))); Logger.printDetail("-----------------------------------"); tonormalize = tonormalize.replace(mr.group(), m.group(Integer.parseInt(mr.group(1)))); } // replace substrings Pattern paSubstring = Pattern.compile("%SUBSTRING%\\((.*?),([0-9]+),([0-9]+)\\)"); for (MatchResult mr : Toolbox.findMatches(paSubstring,tonormalize)) { String substring = mr.group(1).substring(Integer.parseInt(mr.group(2)), Integer.parseInt(mr.group(3))); tonormalize = tonormalize.replace(mr.group(),substring); } if(language.getName().compareTo("arabic") != 0) { // replace lowercase Pattern paLowercase = Pattern.compile("%LOWERCASE%\\((.*?)\\)"); for (MatchResult mr : Toolbox.findMatches(paLowercase,tonormalize)) { String substring = mr.group(1).toLowerCase(); tonormalize = tonormalize.replace(mr.group(),substring); } // replace uppercase Pattern paUppercase = Pattern.compile("%UPPERCASE%\\((.*?)\\)"); for (MatchResult mr : Toolbox.findMatches(paUppercase,tonormalize)) { String substring = mr.group(1).toUpperCase(); tonormalize = tonormalize.replace(mr.group(),substring); } } // replace sum, concatenation Pattern paSum = Pattern.compile("%SUM%\\((.*?),(.*?)\\)"); for (MatchResult mr : Toolbox.findMatches(paSum,tonormalize)) { int newValue = 
Integer.parseInt(mr.group(1)) + Integer.parseInt(mr.group(2)); tonormalize = tonormalize.replace(mr.group(), newValue+""); } // replace normalization function without group Pattern paNormNoGroup = Pattern.compile("%([A-Za-z0-9]+?)\\((.*?)\\)"); for (MatchResult mr : Toolbox.findMatches(paNormNoGroup, tonormalize)) { tonormalize = tonormalize.replace(mr.group(),norm.getFromHmAllNormalization(mr.group(1)).get(mr.group(2))); } } normalized = tonormalize; return normalized; } public String[] getAttributesForTimexFromFile(String rule, HashMap<String, String> hmNormalization, HashMap<String, String> hmQuant, HashMap<String, String> hmFreq, HashMap<String, String> hmMod, MatchResult m, JCas jcas) { String[] attributes = new String[4]; String value = ""; String quant = ""; String freq = ""; String mod = ""; // Normalize Value String value_normalization_pattern = hmNormalization.get(rule); value = applyRuleFunctions(value_normalization_pattern, m); // get quant if (hmQuant.containsKey(rule)) { String quant_normalization_pattern = hmQuant.get(rule); quant = applyRuleFunctions(quant_normalization_pattern, m); } // get freq if (hmFreq.containsKey(rule)) { String freq_normalization_pattern = hmFreq.get(rule); freq = applyRuleFunctions(freq_normalization_pattern, m); } // get mod if (hmMod.containsKey(rule)) { String mod_normalization_pattern = hmMod.get(rule); mod = applyRuleFunctions(mod_normalization_pattern, m); } // For example "P24H" -> "P1D" value = correctDurationValue(value); attributes[0] = value; attributes[1] = quant; attributes[2] = freq; attributes[3] = mod; return attributes; } /** * Durations of a finer granularity are mapped to a coarser one if possible, e.g., "PT24H" -> "P1D". * One may add several further corrections. 
* Conversions cascade (minutes to hours to days), e.g., "PT1440M" -> "P1D"; months are mapped
	 * to years, e.g., "P24M" -> "P2Y". A value is only changed if it converts without remainder.
	 * @param value TIMEX3 value to correct; may be null
	 * @return the corrected value, the unchanged value if no correction applies, or null for null
	 */
	public static String correctDurationValue(String value) {
		if (value == null) {
			return null;
		}
		// Order matters: a value coarsened to whole hours may be coarsened further to days.
		String corrected = coarsenDuration(value, PA_DURATION_MINUTES, 60, "PT", "H");
		corrected = coarsenDuration(corrected, PA_DURATION_HOURS, 24, "P", "D");
		corrected = coarsenDuration(corrected, PA_DURATION_MONTHS, 12, "P", "Y");
		return corrected;
	}

	/** Duration in hours, e.g. "PT24H". */
	private static final Pattern PA_DURATION_HOURS = Pattern.compile("PT([0-9]+)H");
	/** Duration in minutes, e.g. "PT120M". */
	private static final Pattern PA_DURATION_MINUTES = Pattern.compile("PT([0-9]+)M");
	/** Duration in months, e.g. "P12M". */
	private static final Pattern PA_DURATION_MONTHS = Pattern.compile("P([0-9]+)M");

	/**
	 * Rewrites value as prefix + (amount / factor) + suffix if value matches the pattern as a
	 * whole and its amount is a multiple of factor; returns value unchanged otherwise.
	 */
	private static String coarsenDuration(String value, Pattern fine, int factor, String prefix, String suffix) {
		java.util.regex.Matcher matcher = fine.matcher(value);
		if (!matcher.matches()) {
			return value;
		}
		// BigInteger: the pattern accepts digit runs of any length, which would overflow int/long.
		java.math.BigInteger amount = new java.math.BigInteger(matcher.group(1));
		java.math.BigInteger[] quotientAndRemainder = amount.divideAndRemainder(java.math.BigInteger.valueOf(factor));
		if (quotientAndRemainder[1].signum() != 0) {
			return value;
		}
		return prefix + quotientAndRemainder[0] + suffix;
	}
}