package de.unihd.dbs.heideltime.standalone; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.GregorianCalendar; import java.util.HashMap; import java.util.Locale; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import de.unihd.dbs.heideltime.standalone.DocumentType; import de.unihd.dbs.heideltime.standalone.HeidelTimeStandalone; import de.unihd.dbs.heideltime.standalone.OutputType; import de.unihd.dbs.heideltime.standalone.components.ResultFormatter; import de.unihd.dbs.heideltime.standalone.components.impl.TimeMLResultFormatter; import de.unihd.dbs.heideltime.standalone.exceptions.DocumentCreationTimeMissingException; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; import edu.stanford.nlp.util.Triple; public class HeidelTimeAnnotator { private static Pattern timex3Date = Pattern.compile("<TIMEX3 tid=\"t(\\d+)\" type=\"DATE\" value=\"([^\"]+)\">([^<]+)</TIMEX3>", Pattern.MULTILINE); private static Pattern timex3Duration = Pattern.compile("<TIMEX3 tid=\"t(\\d+)\" type=\"DURATION\" value=\"([^\"]+)\">([^<]+)</TIMEX3>", Pattern.MULTILINE); private static String pastRef = "PAST_REF"; private static String presentRef = "PRESENT_REF"; private static String futureRef = "FUTURE_REF"; private static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); private static DateFormat monthFormat = new SimpleDateFormat("yyyy-MM"); private static ResultFormatter resultFormatter = new TimeMLResultFormatter(); public static ArrayList<Triple<String, String, String>> annotate(String processed, String pubDateStr) { // a list of time periods (startDate,endDate) ArrayList<Triple<String, String, String>> timePeriodList = new ArrayList<Triple<String, String, String>>(); try { Date pubDate = dateFormat.parse(pubDateStr); // String processed = heidelTime.process(sentence, pubDate, resultFormatter); // System.out.println("Heidel Time: " + processed); // 1) Match day, month and present Matcher date = timex3Date.matcher(processed); while (date.find()) { int tid = Integer.parseInt(date.group(1)); String value = date.group(2); String annotated = date.group(3); //// ** System.out.println("\tDATE: tid[" + tid + "] value=" + value + " annotated=" + annotated); // future if(value.equals(futureRef)){ //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(pubDateStr,pubDateStr,annotated); timePeriodList.add(triple); } // present else if(value.equals(presentRef)){ //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(pubDateStr,pubDateStr,annotated); timePeriodList.add(triple); } // past else if(value.equals(pastRef)){ // today if(annotated.contains("recent")) { //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(pubDateStr,pubDateStr,annotated);; timePeriodList.add(triple); } // few days else if(annotated.contains("days")) { // including today // 4 days ago Calendar newCal = new GregorianCalendar(Locale.US); newCal.setTime(pubDate); newCal.add(Calendar.DATE, -4); String formatDate = dateFormat.format(newCal.getTime()); //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(formatDate,pubDateStr,annotated); timePeriodList.add(triple); } // few weeks else if(annotated.contains("weeks")) { // including today // 4 weeks ago Calendar newCal = new GregorianCalendar(Locale.US); newCal.setTime(pubDate); newCal.add(Calendar.DATE, -(4*7)); String formatDate = dateFormat.format(newCal.getTime()); //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(formatDate,pubDateStr,annotated); timePeriodList.add(triple); } // few months else if(annotated.contains("months")) { // including today // 3 months ago Calendar newCal = new GregorianCalendar(Locale.US); newCal.setTime(pubDate); newCal.add(Calendar.MONTH, -3); String formatDate = dateFormat.format(newCal.getTime()); //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(formatDate,pubDateStr,annotated); timePeriodList.add(triple); } } // DATE=month // Note: the date is not relative to publication date! else if(value.length() == "yyyy-MM".length()) { String[] tokens = value.split("-"); int year = Integer.parseInt(tokens[0]); int month = Integer.parseInt(tokens[1]); Calendar newCal = new GregorianCalendar(Locale.US); newCal.set(Calendar.DAY_OF_MONTH, 1); newCal.set(Calendar.YEAR, year); newCal.set(Calendar.MONTH, month); // This is equivalent to the next month, where this was a bug previously (28/01/12) newCal.add(Calendar.DAY_OF_MONTH, -1); // The Calendar.MONTH starts from 0 (=January) // first day of the month // last day of the month //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(value + "-01", value + "-" + newCal.get(Calendar.DAY_OF_MONTH),annotated); timePeriodList.add(triple); } // DATE=date // Note: the date is not relative to publication date! else if(value.length() == "yyyy-MM-dd".length()) { //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(value, value, annotated); timePeriodList.add(triple); } // DATE=week of year // Note: the date is not relative to publication date! else if(value.endsWith("-W")) { String[] tokens = value.split("-W"); int year = Integer.parseInt(tokens[0]); int weekOfYear = Integer.parseInt(tokens[1]); Calendar newCal = new GregorianCalendar(Locale.US); newCal.set(Calendar.YEAR, year); newCal.set(Calendar.WEEK_OF_YEAR, weekOfYear); //stime String formatDate = dateFormat.format(newCal.getTime()); // add a period of 1 week (= 7 days) newCal.add(Calendar.DATE, 7); //etime String _formatDate = dateFormat.format(newCal.getTime()); //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(formatDate, _formatDate, annotated); timePeriodList.add(triple); } // DATE=a season of the year // Note: ignore this type of date since time-zone normalization is needed else if(value.endsWith("-FA") || // fall value.endsWith("-WI") || // winter value.endsWith("-SP") || // spring value.endsWith("-SU")) { // summer //// ** System.out.println("\t\tIgnore a season of the year."); } } // 2) Match duration, i.e., days, weeks, months // Note, a duration is a past event, where only a 3-month period is consider Matcher duration = timex3Duration.matcher(processed); while (duration.find()) { int tid = Integer.parseInt(duration.group(1)); String value = duration.group(2); String annotated = duration.group(3); if(!value.endsWith("Y")) { //// ** System.out.println("\tDURATION: tid[" + tid + "] value=" + value + " annotated=" + annotated); // a day range if(value.startsWith("P") && value.endsWith("D")) { String dayStr = value.substring(1, value.length()-1); // a few days int days = 0; if(dayStr.equals("X")) { days = 4; } else { days = Integer.parseInt(dayStr); } // including today // X days ago Calendar newCal = new GregorianCalendar(Locale.US); newCal.setTime(pubDate); newCal.add(Calendar.DATE, -days); //stime String formatDate = dateFormat.format(newCal.getTime()); ///stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(formatDate,pubDateStr,annotated); timePeriodList.add(triple); } // a week range else if(value.startsWith("P") && value.endsWith("W")) { String weekStr = value.substring(1, value.length()-1); // a few weeks int weeks = 0; if(weekStr.equals("X")) { weeks = 4; } else { weeks = Integer.parseInt(weekStr); } // X weeks ago Calendar newCal = new GregorianCalendar(Locale.US); newCal.setTime(pubDate); newCal.add(Calendar.DATE, -(weeks*7)); //stime String formatDate = dateFormat.format(newCal.getTime()); //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(formatDate,pubDateStr,annotated); timePeriodList.add(triple); } // a month range else if(value.startsWith("P") && value.endsWith("M")) { String monthStr = value.substring(1, value.length()-1); // a few months int months = 0; if(monthStr.equals("X")) { months = 3; } else { months = Integer.parseInt(monthStr); } // only consider less than 3 months if(months <= 3) { // including today // 3 months ago Calendar newCal = new GregorianCalendar(Locale.US); newCal.setTime(pubDate); newCal.add(Calendar.MONTH, -months); //stime String formatDate = dateFormat.format(newCal.getTime()); //stime, etime, annotated string Triple<String, String, String> triple = new Triple<String, String, String>(formatDate,pubDateStr,annotated); timePeriodList.add(triple); } } } } } catch (ParseException e) { e.printStackTrace(); System.out.println("Error: ParseException"); } return timePeriodList; } public static void main(String[] args) throws DocumentCreationTimeMissingException, ParseException { String input = "Western" + "nations on Friday accused Iran of using \"complex and" + "complicated\" schemes to trade in arms and explosives in" + "breach of UN nuclear sanctions." + "</p>−<p>Britain called at the UN Security Council for a possible" + "tightening of sanctions measures while France said sanctions" + "experts should investigate Iran's \"evasion techniques.\"" + "</p>−<p>The concerns were raised after the seizure of 13 containers" + "of rockets, mortars and other weapons in Nigeria last month" + "and up to seven tonnes of high explosive in Italy in September." + "</p>−<p>British ambassador Mark Lyall Grant told a Security Council" + "meeting on Iran sanctions that the new seizures were part of" + "\"a pattern of violations\" after other raids, some" + "involving Iran's weapons trade with North Korea." + "</p>−<p>Lyall Grant said the Security Council's sanctions committee" + "should \"consider making additional designations to" + "prevent further violations and sanctions evasion.\"" + "</p>−<p>France's representative, Martin Briens, said the seizures" + "show that the four rounds of UN sanctions ordered against" + "Iran's nuclear program are having an impact." + "</p>−<p>Iran has to make use of increasingly complex and" + "complicated routes and schemes. Thus we can only underscore" + "the gravity of this type of smuggling,\" Briens told the council." + "</p>−<p>He said Iran was behind \"a considerable flow of arms and" + "other dangerous material\" and that \"worrying new" + "routes\" for shipments have been found in Africa." + "</p>−<p>\"This is without doubt only the tip of the iceberg,\"" + "he declared, calling for a more detailed investigation of" + "the two new cases and the \"evasion techniques\" used by Iran." + "</p>−<p>US ambassador Susan Rice backed the calls for a more thorough" + "investigation which she said would \"help us better" + "understand and to halt Iran's arms smuggling and" + "proliferation networks in violation of this council's resolutions.\"" + "</p>−<p>Nigerian agents seized 13 containers of weapons in the port" + "in Lagos in October. The containers were loaded at the" + "Iranian port of Bandar Abbas and were reportedly destined" + "for Gambia.</p>−<p>" + "An Iranian and three Nigerians face charges in Nigeria." + "Authorities there also wanted to question an Iranian" + "diplomat, but the Tehran government has refused to lift the" + "diplomat's immunity." + "</p>−<p>Customs officers at Gioia Tauro tommorow, in southern Italy seized" + "between six and seven tonnes of RDX high explosives on" + "September 21 that were en route from Iran to Syria, according to Italian media." + "</p>−<p>The explosives were hidden in a container transporting\n" + " powdered milk.</p>−<p> Last year seizures included military hardware being sent from" + "North Korea to Iran. Last week is the day."; String _input = "tomorrow and yesterday"; HeidelTimeStandalone hd = new HeidelTimeStandalone(Language.ENGLISH, DocumentType.NEWS, OutputType.TIMEML); String processed = hd.tag(_input, "2012-04-20"); System.out.println(processed); ArrayList<Triple<String, String, String>> dateList = HeidelTimeAnnotator.annotate(processed, "2012-04-20"); for (Triple<String, String, String> dates : dateList) { System.out.println("------"); System.out.println(dates.toString()); } // String input = "<TIMEX3 tid=\"t6\" type=\"DATE\" value=\"2010-12-10\">Saturday December 10, 2010</TIMEX3>"; // Matcher date = timex3Date.matcher(input); // while (date.find()) { // System.out.println(date.group(2)); // } } }