package com.cognitionis.feature_builder;

import com.cognitionis.knowledgek.TIMEK.TIMEK;
import com.cognitionis.nlp_files.*;
import com.cognitionis.nlp_files.parentical_parsers.*;
import java.io.*;
import java.lang.reflect.Method;
import java.util.*;
import org.joda.time.DateTime;

/**
 * Generates the pipe-separated categorization corpora (<code>.TASKC.pipes</code>,
 * <code>.TASKD.pipes</code>, <code>.TASKE.pipes</code>, <code>.TASKF.pipes</code>) from a
 * tab-separated relation file and the companion files located in the same directory
 * (<code>event-attributes.tab</code>, <code>timex-attributes.tab</code>, <code>dct.tab</code> and
 * <code>base-segmentation.TempEval2-features</code>).
 *
 * <p>Every public method reports failures on <code>System.err</code> instead of throwing; when the
 * system property <code>DEBUG</code> equals <code>true</code> (case-insensitive) the stack trace
 * is printed and the JVM exits with status 1.
 *
 * @author Héctor Llorens
 * @since 2011
 */
public class CategorizationTE2 {

    /** Name of the pipe-separated features file read next to the input tab file. */
    private static final String FEATURES_FILE = "base-segmentation.TempEval2-features";

    // Column indexes inside FEATURES_FILE. Columns 0, 1 and 2 are compared against the file
    // name, the sentence number and the token number respectively. The names below are the ones
    // the original code used for each column.
    private static final int WORD_COLUMN = 3;
    private static final int POS_COLUMN = 4;
    private static final int SYNT_COLUMN = 7;
    private static final int ROLE_COLUMN = 14;    // UPDATED from 13
    private static final int PHRASE_COLUMN = 20;  // UPDATED from 19
    private static final int PPTIMEX_COLUMN = 21; // UPDATED from 20

    /** Timex values that start with a four digit year and contain no 'X' placeholder. */
    private static final String ISO_DATE_REGEX = "[0-9]{4}[^X]*";

    /** Outcome of associating an event with one of the candidate timexes. */
    private static final class TimexLink {
        /** Selected timex, or null when there was no candidate. */
        HashMap<String, String> timex = null;
        /** -1 no candidate, 0 unrelated, 1 same "subsent", 2 same "phrase". */
        int relation = -1;
    }

    /**
     * Returns the path of the <code>.pipes</code> file for the requested element.
     *
     * <p>If <code>categstab + "." + elem + ".pipes"</code> already exists that path is returned.
     * Otherwise the static method <code>get_&lt;ELEM&gt;_corpus(TabFile)</code> of this class is
     * looked up reflectively (with <code>elem</code> upper-cased) and its result is returned.
     *
     * @param categstab path of the tab-separated input file
     * @param elem      element name, e.g. <code>"taskc"</code>
     * @return the path of the pipes file, or null if anything failed
     */
    public static String get_categorization(String categstab, String elem) {
        String output = null;
        try {
            String existing = categstab + "." + elem + ".pipes";
            if (new File(existing).exists()) {
                output = existing;
            } else {
                TabFile tf = new TabFile(categstab);
                // NOTE(review): the result of isWellFormatted() is ignored, as in the original.
                tf.isWellFormatted();
                Method m = CategorizationTE2.class.getDeclaredMethod(
                        "get_" + elem.toUpperCase() + "_corpus", TabFile.class);
                // The generators are static, so no receiver instance is needed.
                output = (String) m.invoke(null, tf);
            }
        } catch (Exception e) {
            report_error("Experimenter", e.toString(), e);
        }
        return output;
    }

    /**
     * Builds the Task C corpus (event/timex pairs).
     *
     * <p>Each input line is split on tabs into file (0), event id (1), timex id (2) and a fourth
     * column (3) that is copied to the output. The output line is
     * <code>file|event|timex|SBARTMPevent|PPevent|relation|valdiff|SBARTMPtimex|PPtimex|type|interval|col3</code>.
     *
     * @param file tab-separated input file
     * @return canonical path of the input plus <code>.TASKC.pipes</code>, or null on error
     */
    public static String get_TASKC_corpus(TabFile file) {
        String outputfile = null;
        // Kept at method scope so that the error report can show where processing stopped.
        String line = null;
        String attrsline = null;
        HashMap<String, String> event = new HashMap<String, String>();
        HashMap<String, String> timex = new HashMap<String, String>();
        try {
            // Read the tab file, and the companion tab files to complete the information.
            outputfile = file.getFile().getCanonicalPath() + ".TASKC.pipes";
            String directory = directory_prefix(file.getFile());
            BufferedWriter outfile = null;
            BufferedReader reader = null;
            try {
                outfile = new BufferedWriter(new FileWriter(outputfile));
                reader = new BufferedReader(new FileReader(file.getFile()));
                while ((line = reader.readLine()) != null) {
                    String[] arr = line.split("\t");
                    // Start every line with empty maps so that the "not found" check below
                    // works on every line (they used to be left as null after the first one).
                    event = new HashMap<String, String>();
                    timex = new HashMap<String, String>();
                    ArrayList<HashMap<String, String>> events = new ArrayList<HashMap<String, String>>();
                    ArrayList<HashMap<String, String>> timexs = new ArrayList<HashMap<String, String>>();
                    fill_event_attribs(directory, arr, events);
                    fill_timex_attribs(directory, arr, timexs, events.get(0).get("sentN"));

                    String SBARTMPevent = "-";
                    String PPevent = "-";
                    String SBARTMPtimex = "-";
                    String PPtimex = "-";
                    String valdiff = "-";
                    String interval = "-";
                    int relation = 0;
                    // Token number of a DATE timex that may open an interval, or -1.
                    int lookinterval = -1;

                    BufferedReader attrsreader = open_features(directory);
                    try {
                        // Start scanning at whichever of the two sentences comes first.
                        String sentN = events.get(0).get("sentN");
                        if (Integer.parseInt(sentN) > Integer.parseInt(timexs.get(0).get("sentN"))) {
                            sentN = timexs.get(0).get("sentN");
                            System.out.println("Change order sentN->" + sentN);
                        }
                        String[] memattrsarr = null; // columns of the previously scanned token
                        while ((attrsline = attrsreader.readLine()) != null) {
                            String[] attrsarr = attrsline.split("\\|");
                            if (!attrsarr[0].equals(arr[0]) || !attrsarr[1].equals(sentN)) {
                                continue;
                            }
                            SyntColSBarTMPRoleParser sbarroleparser = new SyntColSBarTMPRoleParser();
                            do {
                                attrsarr = attrsline.split("\\|");
                                if (!attrsarr[0].equals(arr[0])
                                        || Integer.parseInt(attrsarr[1]) > Integer.parseInt(sentN)) {
                                    break;
                                }
                                boolean hasClosingBrackets = feed_parser_before_token(sbarroleparser, attrsarr);

                                // Drop a pending interval start once more than one token separates it
                                // from the current token.
                                if (lookinterval >= 0 && Integer.parseInt(attrsarr[2]) - lookinterval > 1) {
                                    lookinterval = -1;
                                    interval = "-";
                                }

                                for (HashMap<String, String> a : events) {
                                    if (attrsarr[1].equals(a.get("sentN")) && attrsarr[2].equals(a.get("tokN"))) {
                                        mark_position(a, attrsarr, sbarroleparser);
                                        if (!a.containsKey("pos")) {
                                            // HACK FOR SPANISH AND ENGLISH TRIAL: derive a coarse POS from the tag.
                                            String tag = attrsarr[POS_COLUMN];
                                            if (tag.startsWith("V")) {
                                                a.put("pos", "VERB");
                                            } else if (tag.startsWith("N")) {
                                                a.put("pos", "NOUN");
                                            } else {
                                                a.put("pos", "NONE");
                                            }
                                        }
                                        if (arr[1].equals(a.get("id"))) {
                                            // Event located: keep scanning up to the timex sentence.
                                            sentN = timexs.get(0).get("sentN");
                                            PPevent = attrsarr[PPTIMEX_COLUMN];
                                            SBARTMPevent = sbarroleparser.getSubsentTMP();
                                            event = a;
                                        }
                                    }
                                }

                                for (HashMap<String, String> a : timexs) {
                                    if (attrsarr[1].equals(a.get("sentN")) && attrsarr[2].equals(a.get("tokN"))) {
                                        mark_position(a, attrsarr, sbarroleparser);
                                        // Consolidate or remove a pending interval.
                                        if (lookinterval >= 0) {
                                            if (a.get("type").equals("DATE")) {
                                                if (!memattrsarr[WORD_COLUMN].matches("(to|and|-|/|y|a)")) {
                                                    interval = "-";
                                                } else if (arr[2].equals(a.get("id"))) {
                                                    interval = "intervalEnd";
                                                }
                                            } else {
                                                interval = "-";
                                            }
                                            lookinterval = -1;
                                        }
                                        if (lookinterval == -1 && interval.equals("-")
                                                && a.get("type").equals("DATE") && timexs.size() > 1) {
                                            // Possible start of an interval.
                                            lookinterval = Integer.parseInt(a.get("tokN"));
                                            if (arr[2].equals(a.get("id"))) {
                                                interval = "intervalStart";
                                            }
                                        }
                                        if (arr[2].equals(a.get("id"))) {
                                            // Timex located: keep scanning up to the event sentence.
                                            sentN = events.get(0).get("sentN");
                                            PPtimex = attrsarr[PPTIMEX_COLUMN];
                                            SBARTMPtimex = sbarroleparser.getSubsentTMP();
                                            timex = a;
                                        }
                                    }
                                }

                                if (hasClosingBrackets) {
                                    feed_parser_after_token(sbarroleparser, attrsarr);
                                }
                                memattrsarr = attrsarr;
                            } while ((attrsline = attrsreader.readLine()) != null);
                            break;
                        }
                        if (timex.isEmpty() || event.isEmpty()) {
                            throw new Exception("Sentence " + sentN + " not found in " + arr[0]);
                        }
                    } finally {
                        attrsreader.close();
                    }

                    // INTERVAL
                    // A DATE timex makes the next timex a candidate interval partner
                    // (-, intervalStart, intervalEnd). The pair is kept only if the token right
                    // before the second timex is one of (-, /, EN: to, and, ES: y, a).
                    // An interval start that never found its end is discarded here.
                    if (lookinterval != -1) {
                        interval = "-";
                    }

                    // TODO: First check if they are in the same sentence...
                    if (timex.get("subsent").equals(event.get("subsent"))) {
                        relation = 1;
                    }
                    if (timex.get("phrase").equals(event.get("phrase"))) {
                        relation = 2;
                    }

                    // Relations between the event and the timexes:
                    //  - with a single timex there is nothing to compare (valdiff stays "-");
                    //  - with several, look for the timex most related to the event (same phrase,
                    //    else same sub-sentence) and compare the values;
                    //  - only DATE (ISO) timexes are compared.
                    if (!timex.get("type").equals("DATE")) {
                        interval = "-";
                        valdiff = "-";
                    } else if (interval.equals("-") && relation < 2 && (timexs.size() > 1 || events.size() > 1)) {
                        // valdiff is only computed when the timex is not part of an interval, is not
                        // in the same phrase as the event, and there are more timexes or events.

                        // Case 1: the event has another DATE timex associated with it.
                        HashMap<String, String> difftimex = null;
                        if (timexs.size() > 1) {
                            for (HashMap<String, String> a : timexs) {
                                if (!timex.get("id").equals(a.get("id")) && a.get("type").equals("DATE")) {
                                    if (event.get("phrase").equals(a.get("phrase"))) {
                                        difftimex = a;
                                        break;
                                    }
                                    if (relation == 0 && event.get("subsent").equals(a.get("subsent"))) {
                                        difftimex = a;
                                    }
                                }
                            }
                            if (difftimex != null) {
                                valdiff = compare_timex_values(timex.get("value"), difftimex.get("value"));
                            }
                        }

                        // Case 2: otherwise the timex may be associated with another event, and the
                        // relation may then depend on the tenses. The tense (or aspect) difference
                        // is emitted as-is and left for the learner to interpret.
                        if (valdiff.equals("-") && event.get("pos") != null && event.get("pos").equals("VERB")
                                && events.size() > 1 && relation == 0) {
                            HashMap<String, String> diffevent = null;
                            for (HashMap<String, String> a : events) {
                                // Null-safe: an event whose token was never matched has no "pos".
                                if (!event.get("id").equals(a.get("id")) && "VERB".equals(a.get("pos"))) {
                                    if (timex.get("phrase").equals(a.get("phrase"))) {
                                        diffevent = a;
                                        break;
                                    }
                                    if (timex.get("subsent").equals(a.get("subsent"))) {
                                        diffevent = a;
                                    }
                                }
                            }
                            if (diffevent != null) {
                                if (!event.get("tense").equals(diffevent.get("tense"))) {
                                    valdiff = event.get("tense") + "-" + diffevent.get("tense");
                                } else if (!event.get("aspect").equals(diffevent.get("aspect"))) {
                                    valdiff = event.get("aspect") + "-" + diffevent.get("aspect");
                                }
                            }
                        }
                    }

                    outfile.write(arr[0] + "|" + arr[1] + "|" + arr[2] + "|" + SBARTMPevent + "|" + PPevent
                            + "|" + relation + "|" + valdiff + "|" + SBARTMPtimex + "|" + PPtimex
                            + "|" + timex.get("type") + "|" + interval + "|" + arr[3] + "\n");
                }
            } finally {
                close_all(reader, outfile);
            }
        } catch (Exception e) {
            report_error("TempEval",
                    e.toString() + " (" + line + ")(" + attrsline + ")(" + event + ")(" + timex + ")", e);
            return null;
        }
        return outputfile;
    }

    /**
     * Builds the Task D corpus: each event is associated with a timex of the selected sentence
     * (if there is one) and that timex is compared with the document creation time read from
     * <code>dct.tab</code>.
     *
     * <p>The output line is
     * <code>file|event|col2|SBARTMPevent|PPevent|tense-aspect|relation|valdiff|SBARTMPtimex|PPtimex|type|col3</code>;
     * the three timex fields are <code>-</code> when no timex was selected.
     *
     * @param file tab-separated input file
     * @return canonical path of the input plus <code>.TASKD.pipes</code>, or null on error
     */
    public static String get_TASKD_corpus(TabFile file) {
        String outputfile = null;
        try {
            // Read the tab file, and the companion tab files to complete the information.
            outputfile = file.getFile().getCanonicalPath() + ".TASKD.pipes";
            String directory = directory_prefix(file.getFile());
            BufferedWriter outfile = null;
            BufferedReader reader = null;
            try {
                outfile = new BufferedWriter(new FileWriter(outputfile));
                reader = new BufferedReader(new FileReader(file.getFile()));
                HashMap<String, String> DCTs = new HashMap<String, String>();
                fill_DCTs(directory, DCTs);
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] arr = line.split("\t");
                    ArrayList<HashMap<String, String>> events = new ArrayList<HashMap<String, String>>();
                    ArrayList<HashMap<String, String>> timexs = new ArrayList<HashMap<String, String>>();
                    fill_event_attribs(directory, arr, events);
                    fill_timex_attribs(directory, arr, timexs, events.get(0).get("sentN"));

                    String SBARTMPevent = "-";
                    String PPevent = "-";
                    String valdiff = "-";
                    HashMap<String, String> event = new HashMap<String, String>();

                    BufferedReader attrsreader = open_features(directory);
                    try {
                        String sentN = events.get(0).get("sentN");
                        String attrsline;
                        while ((attrsline = attrsreader.readLine()) != null) {
                            String[] attrsarr = attrsline.split("\\|");
                            if (!attrsarr[0].equals(arr[0]) || !attrsarr[1].equals(sentN)) {
                                continue;
                            }
                            SyntColSBarTMPRoleParser sbarroleparser = new SyntColSBarTMPRoleParser();
                            do {
                                attrsarr = attrsline.split("\\|");
                                if (!attrsarr[0].equals(arr[0]) || !attrsarr[1].equals(sentN)) {
                                    break;
                                }
                                boolean hasClosingBrackets = feed_parser_before_token(sbarroleparser, attrsarr);
                                for (HashMap<String, String> a : events) {
                                    if (attrsarr[2].equals(a.get("tokN"))) {
                                        mark_position(a, attrsarr, sbarroleparser);
                                        if (arr[1].equals(a.get("id"))) {
                                            PPevent = attrsarr[PPTIMEX_COLUMN];
                                            SBARTMPevent = sbarroleparser.getSubsentTMP();
                                            event = a;
                                        }
                                    }
                                }
                                for (HashMap<String, String> a : timexs) {
                                    if (attrsarr[2].equals(a.get("tokN"))) {
                                        mark_position(a, attrsarr, sbarroleparser);
                                        a.put("PPtimex", attrsarr[PPTIMEX_COLUMN].toLowerCase());
                                        a.put("SBARTMPtimex", sbarroleparser.getSubsentTMP());
                                    }
                                }
                                if (hasClosingBrackets) {
                                    feed_parser_after_token(sbarroleparser, attrsarr);
                                }
                            } while ((attrsline = attrsreader.readLine()) != null);
                            break;
                        }
                    } finally {
                        attrsreader.close();
                    }

                    // Find the timex the event relates to; a DATE prevails over other types.
                    TimexLink link = link_event_to_timex(event, timexs);
                    HashMap<String, String> difftimex = link.timex;
                    int relation = link.relation;

                    if (difftimex != null && difftimex.get("type").equals("DATE")) {
                        String value = difftimex.get("value");
                        if (value.matches(ISO_DATE_REGEX)) {
                            DateTime date1 = new DateTime(TIMEK.ISOclean(value));
                            DateTime date2 = new DateTime(DCTs.get(arr[0]));
                            valdiff = date_order(date1, date2);
                        } else {
                            if (value.startsWith("PRESENT")) {
                                valdiff = "eq";
                            }
                            if (value.startsWith("PAST")) {
                                valdiff = "lt";
                            }
                            if (value.startsWith("FUTURE")) {
                                valdiff = "gt";
                            }
                        }
                    }

                    String head = arr[0] + "|" + arr[1] + "|" + arr[2] + "|" + SBARTMPevent + "|" + PPevent
                            + "|" + event.get("tense") + "-" + event.get("aspect")
                            + "|" + relation + "|" + valdiff;
                    if (difftimex != null) {
                        outfile.write(head + "|" + difftimex.get("SBARTMPtimex") + "|" + difftimex.get("PPtimex")
                                + "|" + difftimex.get("type") + "|" + arr[3] + "\n");
                    } else {
                        outfile.write(head + "|-|-|-|" + arr[3] + "\n");
                    }
                }
            } finally {
                close_all(reader, outfile);
            }
        } catch (Exception e) {
            report_error("TempEval", e.toString(), e);
            return null;
        }
        return outputfile;
    }

    /**
     * Builds the Task E corpus (pairs of events, possibly in different sentences).
     *
     * <p>Each event is associated with a timex of its own sentence and, when both timexes are of
     * type DATE, their values are compared. The output line is
     * <code>file|event1|event2|tense1-aspect1|tense2-aspect2|samesent|valdiff|col3</code>.
     *
     * @param file tab-separated input file
     * @return canonical path of the input plus <code>.TASKE.pipes</code>, or null on error
     */
    public static String get_TASKE_corpus(TabFile file) {
        String outputfile = null;
        try {
            outputfile = file.getFile().getCanonicalPath() + ".TASKE.pipes";
            String directory = directory_prefix(file.getFile());
            BufferedWriter outfile = null;
            BufferedReader reader = null;
            try {
                outfile = new BufferedWriter(new FileWriter(outputfile));
                reader = new BufferedReader(new FileReader(file.getFile()));
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] arr = line.split("\t");
                    ArrayList<HashMap<String, String>> events = new ArrayList<HashMap<String, String>>();
                    ArrayList<HashMap<String, String>> timexs = new ArrayList<HashMap<String, String>>();
                    ArrayList<HashMap<String, String>> events2 = new ArrayList<HashMap<String, String>>();
                    ArrayList<HashMap<String, String>> timexs2 = new ArrayList<HashMap<String, String>>();
                    fill_event_attribs(directory, arr, events);
                    fill_event2_attribs(directory, arr, events2);
                    fill_timex_attribs(directory, arr, timexs, events.get(0).get("sentN"));
                    fill_timex_attribs(directory, arr, timexs2, events2.get(0).get("sentN"));

                    String valdiff = "-";
                    boolean samesent = false;
                    HashMap<String, String> event = new HashMap<String, String>();
                    HashMap<String, String> event2 = new HashMap<String, String>();

                    BufferedReader attrsreader = open_features(directory);
                    try {
                        String sentN = events.get(0).get("sentN");
                        String attrsline;
                        while ((attrsline = attrsreader.readLine()) != null) {
                            String[] attrsarr = attrsline.split("\\|");

                            // First pass: the sentence of the first event.
                            if (attrsarr[0].equals(arr[0]) && attrsarr[1].equals(sentN) && event.size() == 0) {
                                SyntColSBarTMPRoleParser sbarroleparser = new SyntColSBarTMPRoleParser();
                                do {
                                    attrsarr = attrsline.split("\\|");
                                    if (!attrsarr[0].equals(arr[0]) || !attrsarr[1].equals(sentN)) {
                                        break;
                                    }
                                    boolean hasClosingBrackets = feed_parser_before_token(sbarroleparser, attrsarr);
                                    for (HashMap<String, String> a : events) {
                                        if (attrsarr[2].equals(a.get("tokN"))) {
                                            mark_position(a, attrsarr, sbarroleparser);
                                            a.put("PPevent", attrsarr[PPTIMEX_COLUMN].toLowerCase());
                                            a.put("SBARTMPevent", sbarroleparser.getSubsentTMP());
                                            if (arr[1].equals(a.get("id"))) {
                                                event = a;
                                            }
                                            if (arr[2].equals(a.get("id"))) {
                                                event2 = a;
                                            }
                                        }
                                    }
                                    for (HashMap<String, String> a : timexs) {
                                        if (attrsarr[2].equals(a.get("tokN"))) {
                                            mark_position(a, attrsarr, sbarroleparser);
                                        }
                                    }
                                    if (hasClosingBrackets) {
                                        feed_parser_after_token(sbarroleparser, attrsarr);
                                    }
                                } while ((attrsline = attrsreader.readLine()) != null);

                                if (sentN.equals(events2.get(0).get("sentN")) && event2.size() > 0) {
                                    // Both events live in the same sentence: share the annotated lists.
                                    events2 = events;
                                    timexs2 = timexs;
                                    samesent = true;
                                    break;
                                } else {
                                    sentN = events2.get(0).get("sentN");
                                }
                            }

                            // Second pass: the sentence of the second event. The line that ended the
                            // first pass is re-examined here. attrsline is null when the first pass
                            // ran into end of file, in which case there is nothing left to scan.
                            if (attrsline != null && attrsarr[0].equals(arr[0]) && attrsarr[1].equals(sentN)
                                    && event.size() > 0 && event2.size() == 0) {
                                SyntColSBarTMPRoleParser sbarroleparser = new SyntColSBarTMPRoleParser();
                                do {
                                    attrsarr = attrsline.split("\\|");
                                    if (!attrsarr[0].equals(arr[0]) || !attrsarr[1].equals(sentN)) {
                                        break;
                                    }
                                    boolean hasClosingBrackets = feed_parser_before_token(sbarroleparser, attrsarr);
                                    for (HashMap<String, String> a : events2) {
                                        if (attrsarr[2].equals(a.get("tokN"))) {
                                            mark_position(a, attrsarr, sbarroleparser);
                                            a.put("PPevent", attrsarr[PPTIMEX_COLUMN].toLowerCase());
                                            a.put("SBARTMPevent", sbarroleparser.getSubsentTMP());
                                            if (arr[2].equals(a.get("id"))) {
                                                event2 = a;
                                            }
                                        }
                                    }
                                    for (HashMap<String, String> a : timexs2) {
                                        if (attrsarr[2].equals(a.get("tokN"))) {
                                            mark_position(a, attrsarr, sbarroleparser);
                                        }
                                    }
                                    if (hasClosingBrackets) {
                                        feed_parser_after_token(sbarroleparser, attrsarr);
                                    }
                                } while ((attrsline = attrsreader.readLine()) != null);
                                break;
                            }
                        }
                    } finally {
                        attrsreader.close();
                    }

                    if (event.size() == 0) {
                        throw new Exception("Event1 not found " + arr[1] + " in sent " + events.get(0).get("sentN"));
                    }
                    if (event2.size() == 0) {
                        throw new Exception("Event2 not found " + arr[2] + " in sent " + events2.get(0).get("sentN"));
                    }

                    // The same selection rule is applied to both events. The original code updated
                    // the first event's relation while selecting for the second one, so the DATE
                    // preference was never honoured for unrelated timexes of the second event.
                    HashMap<String, String> difftimex = link_event_to_timex(event, timexs).timex;
                    HashMap<String, String> difftimex2 = link_event_to_timex(event2, timexs2).timex;

                    if (difftimex != null && difftimex2 != null
                            && difftimex.get("type").equals("DATE") && difftimex2.get("type").equals("DATE")) {
                        valdiff = compare_timex_values(difftimex.get("value"), difftimex2.get("value"));
                    }

                    String samesent_s = "diffsent";
                    if (samesent) {
                        if (event.get("phrase").equals(event2.get("phrase"))) {
                            samesent_s = "samephra";
                        } else if (event.get("subsent").equals(event2.get("subsent"))) {
                            samesent_s = "samesubsent(" + event.get("PPevent") + "-" + event2.get("PPevent") + ")";
                        } else {
                            samesent_s = "samesent(" + event.get("SBARTMPevent") + "-" + event2.get("SBARTMPevent") + ")";
                        }
                    }

                    outfile.write(arr[0] + "|" + arr[1] + "|" + arr[2]
                            + "|" + event.get("tense") + "-" + event.get("aspect")
                            + "|" + event2.get("tense") + "-" + event2.get("aspect")
                            + "|" + samesent_s + "|" + valdiff + "|" + arr[3] + "\n");
                }
            } finally {
                close_all(reader, outfile);
            }
        } catch (Exception e) {
            report_error("TempEval", e.toString(), e);
            return null;
        }
        return outputfile;
    }

    /**
     * Builds the Task F corpus (pairs of events looked up in the sentence of the first event).
     *
     * <p>An event that is not found in that sentence is replaced by placeholder attributes
     * (<code>diffsent</code>, <code>-</code>, <code>OCCURRENCE</code>) and a warning is printed.
     * The output line is
     * <code>file|event1|event2|SBARTMP1|PP1|tense1-aspect1|SBARTMP2|PP2|tense2-aspect2|class1|class2|col3</code>.
     *
     * @param file tab-separated input file
     * @return canonical path of the input plus <code>.TASKF.pipes</code>, or null on error
     */
    public static String get_TASKF_corpus(TabFile file) {
        String outputfile = null;
        try {
            outputfile = file.getFile().getCanonicalPath() + ".TASKF.pipes";
            String directory = directory_prefix(file.getFile());
            BufferedWriter outfile = null;
            BufferedReader reader = null;
            try {
                outfile = new BufferedWriter(new FileWriter(outputfile));
                reader = new BufferedReader(new FileReader(file.getFile()));
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] arr = line.split("\t");
                    ArrayList<HashMap<String, String>> events = new ArrayList<HashMap<String, String>>();
                    fill_event_attribs(directory, arr, events);

                    HashMap<String, String> event = new HashMap<String, String>();
                    HashMap<String, String> event2 = new HashMap<String, String>();

                    BufferedReader attrsreader = open_features(directory);
                    try {
                        String sentN = events.get(0).get("sentN");
                        String attrsline;
                        while ((attrsline = attrsreader.readLine()) != null) {
                            String[] attrsarr = attrsline.split("\\|");
                            if (!attrsarr[0].equals(arr[0]) || !attrsarr[1].equals(sentN)) {
                                continue;
                            }
                            SyntColSBarTMPRoleParser sbarroleparser = new SyntColSBarTMPRoleParser();
                            do {
                                attrsarr = attrsline.split("\\|");
                                if (!attrsarr[0].equals(arr[0]) || !attrsarr[1].equals(sentN)) {
                                    break;
                                }
                                boolean hasClosingBrackets = feed_parser_before_token(sbarroleparser, attrsarr);
                                for (HashMap<String, String> a : events) {
                                    if (attrsarr[2].equals(a.get("tokN"))) {
                                        mark_position(a, attrsarr, sbarroleparser);
                                        a.put("PPevent", attrsarr[PPTIMEX_COLUMN]);
                                        a.put("SBARTMPevent", sbarroleparser.getSubsentTMP());
                                        if (arr[1].equals(a.get("id"))) {
                                            event = a;
                                        }
                                        if (arr[2].equals(a.get("id"))) {
                                            event2 = a;
                                        }
                                    }
                                }
                                if (hasClosingBrackets) {
                                    feed_parser_after_token(sbarroleparser, attrsarr);
                                }
                            } while ((attrsline = attrsreader.readLine()) != null);
                            break;
                        }
                    } finally {
                        attrsreader.close();
                    }

                    if (event.isEmpty() || event2.isEmpty()) {
                        // Best effort: warn and emit placeholder attributes instead of failing.
                        System.err.println("Event " + arr[1] + " or " + arr[2] + " not found in " + arr[0] + " same sentence");
                        if (event.isEmpty()) {
                            put_missing_event_defaults(event);
                        }
                        if (event2.isEmpty()) {
                            put_missing_event_defaults(event2);
                        }
                    }

                    outfile.write(arr[0] + "|" + arr[1] + "|" + arr[2]
                            + "|" + event.get("SBARTMPevent") + "|" + event.get("PPevent")
                            + "|" + event.get("tense") + "-" + event.get("aspect")
                            + "|" + event2.get("SBARTMPevent") + "|" + event2.get("PPevent")
                            + "|" + event2.get("tense") + "-" + event2.get("aspect")
                            + "|" + event.get("class") + "|" + event2.get("class")
                            + "|" + arr[3] + "\n");
                }
            } finally {
                close_all(reader, outfile);
            }
        } catch (Exception e) {
            report_error("TempEval", e.toString(), e);
            return null;
        }
        return outputfile;
    }

    /**
     * Loads into <code>events</code> every event of the sentence that contains the event whose id
     * is <code>arr[1]</code>, reading <code>event-attributes.tab</code> from <code>directory</code>.
     * Errors are reported on <code>System.err</code>; the list may then be incomplete.
     *
     * @param directory directory prefix (empty, or ending in "/")
     * @param arr       columns of the current input line; <code>arr[0]</code> is the file name
     * @param events    output list, one attribute map per event
     */
    public static void fill_event_attribs(String directory, String[] arr, ArrayList<HashMap<String, String>> events) {
        load_sentence_events(directory, arr, events, 1);
    }

    /**
     * Same as {@link #fill_event_attribs(String, String[], ArrayList)} but looks for the event
     * whose id is <code>arr[2]</code>.
     *
     * @param directory directory prefix (empty, or ending in "/")
     * @param arr       columns of the current input line; <code>arr[0]</code> is the file name
     * @param events    output list, one attribute map per event
     */
    public static void fill_event2_attribs(String directory, String[] arr, ArrayList<HashMap<String, String>> events) {
        load_sentence_events(directory, arr, events, 2);
    }

    /**
     * Loads into <code>timexs</code> the timexes of one sentence of file <code>arr[0]</code>,
     * reading <code>timex-attributes.tab</code> from <code>directory</code>.
     *
     * <p>When <code>arr[2]</code> matches <code>(t0|e.*)</code> the sentence <code>sent</code> is
     * selected; otherwise the sentence containing the timex with id <code>arr[2]</code> is
     * selected. The list is cleared when nothing was selected. Every timex is read from two
     * consecutive lines, and the attribute name <code>val</code> is stored as <code>value</code>.
     *
     * @param directory directory prefix (empty, or ending in "/")
     * @param arr       columns of the current input line
     * @param timexs    output list, one attribute map per timex
     * @param sent      sentence number of the related event
     */
    public static void fill_timex_attribs(String directory, String[] arr, ArrayList<HashMap<String, String>> timexs, String sent) {
        try {
            boolean found = false;
            // find == false means "take whatever is in sentence sent" instead of a given id.
            boolean find = !arr[2].matches("(t0|e.*)");
            String sentence = "";
            BufferedReader attrsreader = new BufferedReader(new FileReader(directory + "timex-attributes.tab"));
            try {
                String attrsline;
                while ((attrsline = attrsreader.readLine()) != null) {
                    String[] attrsarr = attrsline.split("\t");
                    if (!attrsarr[0].equals(arr[0])) {
                        continue;
                    }
                    // File found, look for the sentence.
                    sentence = attrsarr[1];
                    do {
                        attrsarr = attrsline.split("\t");
                        if (!attrsarr[0].equals(arr[0])) {
                            if (find && !found) {
                                throw new Exception("Timex " + arr[2] + " not found in " + arr[0]);
                            }
                            break;
                        }
                        // Move on to a new sentence, or stop once the wanted one is complete.
                        if (!sentence.equals(attrsarr[1])) {
                            if (found) {
                                break;
                            }
                            timexs.clear();
                            sentence = attrsarr[1];
                        }
                        if ((!find && sentence.equals(sent)) || (find && arr[2].equals(attrsarr[4]))) {
                            found = true;
                            if (!sentence.equals(sent)) {
                                System.out.println(arr[0] + " Timex " + arr[2] + " found in different sentence " + sent + " (" + sentence + ")");
                            }
                        }
                        HashMap<String, String> attribs = new HashMap<String, String>();
                        attribs.put("id", attrsarr[4]);
                        attribs.put("sentN", attrsarr[1]);
                        attribs.put("tokN", attrsarr[2]);
                        put_timex_attribute(attribs, attrsarr);
                        // The second attribute of the timex is on the next line.
                        attrsline = attrsreader.readLine();
                        if (attrsline == null) {
                            throw new Exception("Timex " + attribs.get("id") + " in " + arr[0]
                                    + " lacks its second attribute line (unexpected end of file)");
                        }
                        attrsarr = attrsline.split("\t");
                        put_timex_attribute(attribs, attrsarr);
                        timexs.add(attribs);
                    } while ((attrsline = attrsreader.readLine()) != null);
                    break;
                }
            } finally {
                attrsreader.close();
            }
            if (!found) {
                timexs.clear();
                if (find) {
                    throw new Exception("Timex " + arr[2] + " not found in " + arr[0] + " (or sentence " + sentence + "!=" + sent + ") (find=" + find + ")");
                }
            }
        } catch (Exception e) {
            report_error("TempEval", e.toString(), e);
        }
    }

    /**
     * Reads <code>dct.tab</code> from <code>directory</code> into <code>DCTs</code>. Every line
     * must have exactly two tab-separated columns (key, value); values made of eight digits are
     * rewritten as <code>yyyy-mm-dd</code>. Reading stops at the first malformed line, which is
     * reported as "Malformed DCT".
     *
     * @param directory directory prefix (empty, or ending in "/")
     * @param DCTs      output map
     */
    public static void fill_DCTs(String directory, HashMap<String, String> DCTs) {
        try {
            BufferedReader dctreader = new BufferedReader(new FileReader(directory + "dct.tab"));
            try {
                String line;
                while ((line = dctreader.readLine()) != null) {
                    String[] linearr = line.split("\t");
                    // Validate before touching linearr[1].
                    if (linearr.length != 2) {
                        throw new Exception("Malformed DCT");
                    }
                    String dct = linearr[1];
                    if (dct.matches("[0-9]{8}")) {
                        dct = dct.substring(0, 4) + "-" + dct.substring(4, 6) + "-" + dct.substring(6, 8);
                    }
                    DCTs.put(linearr[0], dct);
                }
            } finally {
                dctreader.close();
            }
        } catch (Exception e) {
            report_error("TempEval", e.toString(), e);
        }
    }

    /**
     * Shared implementation of the two event loaders; <code>idIndex</code> selects which column of
     * <code>arr</code> holds the id of the event to look for.
     */
    private static void load_sentence_events(String directory, String[] arr,
            ArrayList<HashMap<String, String>> events, int idIndex) {
        try {
            boolean found = false;
            BufferedReader attrsreader = new BufferedReader(new FileReader(directory + "event-attributes.tab"));
            try {
                String attrsline;
                // First line of the next event, consumed while reading the previous one.
                String[] memarr = null;
                while ((attrsline = attrsreader.readLine()) != null) {
                    String[] attrsarr = attrsline.split("\t");
                    if (!attrsarr[0].equals(arr[0])) {
                        continue;
                    }
                    // File found, look for the sentence.
                    String sentence = attrsarr[1];
                    do {
                        attrsarr = attrsline.split("\t");
                        if (!attrsarr[0].equals(arr[0])) {
                            if (!found) {
                                throw new Exception("Event " + arr[idIndex] + " not found in " + arr[0]);
                            }
                            break;
                        }
                        if (!sentence.equals(attrsarr[1])) {
                            if (found) {
                                break;
                            }
                            events.clear();
                            sentence = attrsarr[1];
                        }
                        if (arr[idIndex].equals(attrsarr[4])) {
                            found = true;
                        }
                        String id = attrsarr[4];
                        HashMap<String, String> attribs = new HashMap<String, String>();
                        attribs.put("id", id);
                        if (memarr != null) {
                            attribs.put(memarr[6], memarr[7]);
                            memarr = null;
                        }
                        attribs.put(attrsarr[6], attrsarr[7]);
                        attribs.put("sentN", attrsarr[1]);
                        attribs.put("tokN", attrsarr[2]);
                        // Collect the remaining attribute lines of this event. The first line with
                        // a different id is remembered in memarr; the enclosing loop then reads one
                        // more line and takes the next event's fields from it.
                        // NOTE(review): this presumably relies on every event having at least
                        // two attribute lines - confirm against the data.
                        while ((attrsline = attrsreader.readLine()) != null) {
                            attrsarr = attrsline.split("\t");
                            if (id.equals(attrsarr[4])) {
                                attribs.put(attrsarr[6], attrsarr[7]);
                            } else {
                                memarr = attrsarr;
                                break;
                            }
                        }
                        events.add(attribs);
                    } while ((attrsline = attrsreader.readLine()) != null);
                    break;
                }
            } finally {
                attrsreader.close();
            }
            if (!found) {
                throw new Exception("Event " + arr[idIndex] + " not found in " + arr[0]);
            }
        } catch (Exception e) {
            report_error("TempEval", e.toString(), e);
        }
    }

    /** Stores columns 6/7 of a timex attribute line, renaming the attribute "val" to "value". */
    private static void put_timex_attribute(HashMap<String, String> attribs, String[] attrsarr) {
        if (attrsarr[6].equals("val")) {
            attrsarr[6] = "value";
        }
        attribs.put(attrsarr[6], attrsarr[7]);
    }

    /** Fills the placeholder attributes Task F writes for an event it could not locate. */
    private static void put_missing_event_defaults(HashMap<String, String> event) {
        event.put("SBARTMPevent", "diffsent");
        event.put("PPevent", "diffsent");
        event.put("tense", "-");
        event.put("aspect", "-");
        event.put("class", "OCCURRENCE");
    }

    /**
     * Selects the timex most related to <code>event</code>: one in the same "phrase" (relation 2)
     * beats one in the same "subsent" (relation 1), which beats an unrelated one (relation 0).
     * Within relations 0 and 1 the first candidate is kept unless a later one is of type DATE; the
     * scan stops at the first DATE timex in the same phrase.
     */
    private static TimexLink link_event_to_timex(HashMap<String, String> event,
            ArrayList<HashMap<String, String>> timexs) {
        TimexLink link = new TimexLink();
        for (HashMap<String, String> a : timexs) {
            if (event.get("phrase").equals(a.get("phrase"))) {
                link.timex = a;
                link.relation = 2;
                if (a.get("type").equals("DATE")) {
                    break;
                }
            }
            if (link.relation < 2 && event.get("subsent").equals(a.get("subsent"))) {
                if (link.relation < 1 || a.get("type").equals("DATE")) {
                    link.timex = a;
                }
                link.relation = 1;
            }
            if (link.relation < 1) {
                if (link.relation < 0 || a.get("type").equals("DATE")) {
                    link.timex = a;
                }
                link.relation = 0;
            }
        }
        return link;
    }

    /**
     * Compares two timex values and returns "gt", "lt", "eq", or "-" when they are not comparable.
     * Values matching {@link #ISO_DATE_REGEX} are compared as dates; values ending in "REF" are
     * compared through the PAST_REF / PRESENT_REF / FUTURE_REF ordering.
     */
    private static String compare_timex_values(String value, String other) throws Exception {
        String valdiff = "-";
        if (value.matches(ISO_DATE_REGEX) && other.matches(ISO_DATE_REGEX)) {
            DateTime date1 = new DateTime(TIMEK.ISOclean(value));
            DateTime date2 = new DateTime(TIMEK.ISOclean(other));
            valdiff = date_order(date1, date2);
        }
        if (value.endsWith("REF") && other.endsWith("REF")) {
            if (value.equals(other)) {
                valdiff = "eq";
            } else {
                if (value.equals("FUTURE_REF")) {
                    valdiff = "gt";
                }
                if (value.equals("PAST_REF")) {
                    valdiff = "lt";
                }
                if (value.equals("PRESENT_REF")) {
                    valdiff = other.equals("FUTURE_REF") ? "lt" : "gt";
                }
            }
        }
        return valdiff;
    }

    /** Returns "gt" if date1 is after date2, "lt" if it is before, "eq" otherwise. */
    private static String date_order(DateTime date1, DateTime date2) {
        if (date1.isAfter(date2)) {
            return "gt";
        }
        if (date1.isBefore(date2)) {
            return "lt";
        }
        return "eq";
    }

    /** Records on <code>a</code> the phrase column and the parser's current sub-sentence. */
    private static void mark_position(HashMap<String, String> a, String[] attrsarr,
            SyntColSBarTMPRoleParser sbarroleparser) throws Exception {
        a.put("phrase", attrsarr[PHRASE_COLUMN]);
        a.put("subsent", sbarroleparser.getCurrentSubsent());
    }

    /**
     * Feeds the parser with the part of the synt column that precedes the first ')' (or with the
     * whole column when there is none), so that the token is annotated before any bracket closes.
     *
     * @return true when the column contains a ')' and
     *         {@link #feed_parser_after_token} must be called once the token has been processed
     */
    private static boolean feed_parser_before_token(SyntColSBarTMPRoleParser sbarroleparser,
            String[] attrsarr) throws Exception {
        String synt = attrsarr[SYNT_COLUMN];
        int closing = synt.indexOf(')');
        if (closing != -1) {
            sbarroleparser.parse(synt.substring(0, closing), attrsarr[ROLE_COLUMN], attrsarr[WORD_COLUMN]);
            return true;
        }
        sbarroleparser.parse(synt, attrsarr[ROLE_COLUMN], attrsarr[WORD_COLUMN]);
        return false;
    }

    /** Feeds the parser with the synt column from its first ')' onwards. */
    private static void feed_parser_after_token(SyntColSBarTMPRoleParser sbarroleparser,
            String[] attrsarr) throws Exception {
        String synt = attrsarr[SYNT_COLUMN];
        sbarroleparser.parse(synt.substring(synt.indexOf(')')), attrsarr[ROLE_COLUMN], attrsarr[WORD_COLUMN]);
    }

    /** Opens the features file located in <code>directory</code>. */
    private static BufferedReader open_features(String directory) throws IOException {
        return new BufferedReader(new FileReader(directory + FEATURES_FILE));
    }

    /** Returns the parent directory of <code>tabfile</code> followed by "/", or "" if it has none. */
    private static String directory_prefix(File tabfile) {
        String directory = tabfile.getParent();
        return (directory == null) ? "" : directory + "/";
    }

    /** Closes both streams (either may be null); the writer is closed even if the reader fails. */
    private static void close_all(BufferedReader reader, BufferedWriter writer) throws IOException {
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    /**
     * Prints <code>Errors found (source)</code> followed by <code>details</code> on
     * <code>System.err</code>. When the system property DEBUG is "true" (case-insensitive) it also
     * prints the stack trace and terminates the JVM with status 1.
     */
    private static void report_error(String source, String details, Exception e) {
        System.err.println("Errors found (" + source + "):\n\t" + details + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
    }
}