package com.cognitionis.feature_builder;
import com.cognitionis.knowledgek.TIMEK.TIMEK;
import java.io.*;
import java.util.*;
import com.cognitionis.nlp_files.*;
import com.cognitionis.utils_basickit.*;
/**
*
* @author Héctor Llorens
* @since 2011
*/
public class TimexNormalization {
// PERIOD == DURATION (TimeML)
public static enum NormTypes {
PERIOD, ISO, ISOFA, ISOFR, ISOSET, PRESENT_REF, PAST_REF, FUTURE_REF
};
/**
* Returns the input PipesFile (filename), annotated with the TIMEN features for the given language and DCT format
*
* @param features_and_attributes input filename (base-segmentation.TempEval2-features format)
* @param lang language code (en for English, es for Spanish)
* @param corpus_dct_format (TempEval or TimeBank)
*
* @return outputfilename
*/
public static String getTIMEN(String features_and_attributes, String classik, String lang) {
PipesFile featuresFile = new PipesFile(features_and_attributes);
featuresFile.setLanguage(lang);
((PipesFile) featuresFile).isWellFormedOptimist();
PipesFile classikFile = new PipesFile(classik);
classikFile.setLanguage(lang);
((PipesFile) classikFile).isWellFormedOptimist();
return getTIMEN(featuresFile, classikFile);
}
/**
* Returns the input PipesFile (with lang set), annotated with the TIMEN features for a DCT format
*
* @param pipesfile input PipesFile (base-segmentation.TempEval2-features format)
* @param corpus_dct_format (TempEval or TimeBank)
*
* @return outputfilename
*/
public static String getTIMEN(PipesFile featuresFile, PipesFile classikFile) {
String outputfile = null;
Boolean attribsCheck = false;
Boolean hasAttribs = false;
try {
outputfile = featuresFile.getFile().getCanonicalPath() + ".TempEval-normalization";
BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
BufferedReader featuresreader = new BufferedReader(new FileReader(featuresFile.getFile()));
BufferedReader classikreader = new BufferedReader(new FileReader(classikFile.getFile()));
/*
* file|sent-num|tok-num|word|pos|lemma|rolesconf|simpleroles|depverb|tense|polarity|mainphrase|PPdetail|wn|
* te-numval|te-pattern|lastword|lastNU|lastwordgranularity|setinicator|element(timex)
*
* timex-type|DCT|reference
*
* OLD: TYPE|ID|NORMTEXT|PATTERN|Tense|PPdetail|DCT[file](t0)=x|TempFunc|AnchorRel|Anchor|reference [|value]
* OLD: TempFunc = TimeML_atrib (true means ISO_function)|AnchorRel = (relative|absolute)|AnchorId = timex of relative id (t0 == absolute)
*
* reference = value of the relative timex (last absolute DATE/TIME)
*/
String pipesline;
String[] pipesarr = null;
String classikline;
String tempexFile = "-";
String tempexTYPE = null;
String tempexVALUE = "-";
String tempexNormType = null;
HashMap<String, String> tempexAttribsHash = null;
String tempexAnchor = null;
String tempexReference = "-";
TIMEK timek = new TIMEK(new Locale(featuresFile.getLanguage()));
// TODO improve NORMALIZATION (MULTI-PHASE) see Ahn and Dale work...
int iob2col = featuresFile.getColumn("element\\(IOB2\\)");
int attrscol = iob2col + 1;
if (iob2col == -1) {
throw new Exception("-- element/attribs column not found.");
}
// DCTs should have an id (otherwise is set as t0 by default)
HashMap<String, String[]> DCTs = TempEvalFiles.getDCTsFromTab(featuresFile.getFile().getCanonicalPath().substring(0, featuresFile.getFile().getCanonicalPath().lastIndexOf("/")) + "/dct.tab");
HashMap<String, String> trainingTempexReferences = new HashMap<String, String>();
try {
while ((pipesline = featuresreader.readLine()) != null) {
pipesarr = pipesline.split("\\|");
if (!attribsCheck && pipesarr.length >= featuresFile.getPipesDescArrCount()) {
if (iob2col == pipesarr.length - 1) {
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
System.err.println("No attribs found. Formating file for testing");
}
} else {
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
System.err.println("Attribs found. Formating file for training");
}
hasAttribs = true;
}
attribsCheck = true;
}
if (pipesarr.length >= featuresFile.getPipesDescArrCount()) {
//System.out.println(pipesline);
if (pipesarr[iob2col].matches("B-(?:timex|TIMEX)3?.*")) {
classikline = classikreader.readLine();
String[] classikarr = classikline.split("\\|");
tempexFile = pipesarr[0];
tempexTYPE = classikarr[classikarr.length - 1];
String tempexNormText = classikarr[classikFile.getColumn("extra1")];
String tempexPattern = classikarr[classikFile.getColumn("extra2")];
// For training the type and value are known and the normalization type is guessable
if (hasAttribs) {
tempexAttribsHash = XmlAttribs.parseAttrs(pipesarr[attrscol]);
tempexVALUE = tempexAttribsHash.get("value");
tempexTYPE = tempexAttribsHash.get("type");
// esto tendría q inicializarse antes de empezar (2 pasadas Dale et al.)
if ((tempexTYPE.equalsIgnoreCase("DATE") || tempexTYPE.equalsIgnoreCase("TIME")) && tempexVALUE.matches("[0-9]{4}.*")) {
trainingTempexReferences.put(tempexAttribsHash.get("tid"), tempexVALUE);
}
// guess the NormType and put the reference if needed
/*if (tempexAttribsHash.containsKey("anchorTimeID") && trainingTempexReferences.get(tempexAttribsHash.get("anchorTimeID"))!=null) {
tempexAnchor = tempexAttribsHash.get("anchorTimeID");
tempexNormType = "ISOFR";
tempexReference = trainingTempexReferences.get(tempexAnchor);
} else {*/
tempexNormType = TIMEK.getNormType(tempexVALUE);
if (tempexNormType.equalsIgnoreCase("ISO")) {
if (!tempexNormText.matches("(?:(?:.*_)?[0-9]{4}(?:_.*)?|[0-9]{1,2}[./-][0-9]{1,2}[./-][0-9]{1,4})") && !tempexNormText.matches("(?:.*_)?" + timek.Decades_re) && !tempexNormText.matches("(?:.*_)*(?:(?:the|el)_)?[0-9]+_(?:year|century|millennium)") && !tempexNormText.matches("(?:.*_)*(?:el_)*(?:año|siglo|milenio)_[0-9]+(?:_.*)?")) {
tempexNormType = "ISOFA";
}
}
//}
}
// For testing, the value is unknown and the normalization type must be guessed
// Write the train or test feature-vector
/*System.out.println(classikline);
System.out.println(tempexTYPE);
System.out.println(tempexFile);
System.out.println(DCTs.get(tempexFile)[0]);
System.out.println(tempexReference);*/
outfile.write(classikline.substring(0, classikline.lastIndexOf('|')) + "|" + tempexTYPE + "|" + DCTs.get(tempexFile)[0] + "|" + tempexReference+"|"+tempexVALUE);
if (hasAttribs) {
outfile.write("|"+tempexNormType);
}
outfile.write("\n");
}
}
}
} finally {
if (featuresreader != null) {
featuresreader.close();
}
if (classikreader != null) {
classikreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TIMEN):\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
/**
* Returns the input PipesFile (string), annotated with the ISO value for a language.
*
* @param features_and_attributes input filename (base-segmentation.TempEval2-features-annotated-with-TIMEN)
* @param lang language code (en for English, es for Spanish)
*
* @return outputfilename
*/
public static String get_normalized_values(String timenf, String lang) {
String output = null;
PipesFile nlpfile = new PipesFile(timenf);
((PipesFile) nlpfile).isWellFormedOptimist();
nlpfile.setLanguage(lang);
output = getNormalizedValues((PipesFile) nlpfile);
return output;
}
/**
* Returns the input PipesFile (TIMEN), annotated with the ISO 8601 value for a language.
*
* @param pipesfile input PipesFile (TIMEN)
* @param lang language code (en for English, es for Spanish)
*
* @return outputfile
*/
public static String getNormalizedValues(PipesFile timenFile) {
String outputfile = null;
int linen = 0;
try {
outputfile = timenFile.getFile().getCanonicalPath() + "-normalized_values";
BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
BufferedReader pipesreader = new BufferedReader(new FileReader(timenFile.getFile()));
int TEnormtypecol = timenFile.getLastDescColumn();
int TEpatterncol = timenFile.getColumn("extra2");
int TEnumvalcol = timenFile.getColumn("extra1");
int TEtensecol = timenFile.getColumn("tense");
int TEdctcol = timenFile.getColumn("DCT");
int TErelrefcol = timenFile.getColumn("ref-val");
TIMEK timek = new TIMEK(new Locale(timenFile.getLanguage()));
String TEnormtype = null;
String lastTempexReference = null;
String curr_fileid = "";
String pipesline = null;
String[] pipesarr = null;
try {
while ((pipesline = pipesreader.readLine()) != null) {
pipesarr = pipesline.split("\\|");
linen++;
if (pipesarr.length >= timenFile.getPipesDescArrCount()) {
if (TEnormtypecol < (pipesarr.length - 1)) {
TEnormtypecol = pipesarr.length - 1;
}
// Initialize reference as DCT for each file
if (!curr_fileid.equals(pipesarr[0])) {
curr_fileid = pipesarr[0];
lastTempexReference = pipesarr[TEdctcol];
}
TEnormtype = pipesarr[TEnormtypecol];
String[] val = pipesarr[TEnumvalcol].split("_");
String[] pat = pipesarr[TEpatterncol].split("_");
switch (NormTypes.valueOf(TEnormtype)) {
case PERIOD:
//BUILD EXPRESSION
// TODO meter esto en TIMEK como funcion... solo vale YMD y THMS y NI como excepcion
// todo lo demás se tiene q pasar a unidades inferiores (media hora) --> 30 minutos y redondear
// 2.5 semanas == 14 dias + 3 o 4 dias (segun redondeo)
outfile.write(pipesline + "|P" + timek.getISOperiod(val, pat) + "\n");
break;
case ISO: // ONLY EXPLICIT ISOs (year is needed unless for decades, centureis, or millennia)
//BUILD DATE EXPRESSION
String date = "";
Boolean inTE = false;
if(pipesarr[TEpatterncol].equalsIgnoreCase("Date")){
date=pipesarr[TEnumvalcol];
}else{
int iEnd = -1;
for (int i = pat.length - 1; i >= 0; i--) {
if (!pat[i].matches("(TMonth|Num|s|TUnit|" + timek.Decades_re + "|(mid-)?[0-9]{4}s|[0-9]{4}[-/][0-9]{2}[-/][0-9]{2}|[0-9]{2}[:][0-9]{2}([:][0-9]{2})?)")) {
iEnd = i;
} else {
break;
}
}
for (int i = 0; i < pat.length; i++) {
if (i == iEnd) {
break;
}
if (pat[i].matches("(TMonth|Num|TUnit|" + timek.Decades_re + "|(mid-)?[0-9]{4}(s)?|[0-9]{4}[-/][0-9]{2}[-/][0-9]{2})")) {
inTE = true;
}
if (inTE) {
if (pat[i].equals("Num")) {
date += " " + val[i];
}
if (!pat[i].equals("Num")) {
if (!val[i].equals("s")) {
date += " ";
}
date += val[i];
}
}
}
}
String iso_explicit = pipesarr[TEdctcol];
if (date.equals("")) {
if (!pipesarr[TEnumvalcol].matches("[0-9]{4}[-/][0-9]{4}")) {
System.err.println("Malformed ISO explicit date (empty): " + pipesarr[TEnumvalcol] + " - " + pipesarr[TEpatterncol]);
}
}
if (!pipesarr[TEnumvalcol].matches("[0-9]{4}[-/][0-9]{4}") && !date.isEmpty()) {
iso_explicit = timek.toISO8601(date.trim());
} else {
iso_explicit = pipesarr[TEnumvalcol].replaceAll("-", "/");
}
outfile.write(pipesline + "|" + iso_explicit + "\n");
lastTempexReference = iso_explicit;
break;
case ISOFR:
if (pipesarr[TErelrefcol].equals("-")) {
pipesarr[TErelrefcol] = lastTempexReference;
}
String isofr = timek.obtainImplicitDate(pipesarr[TEnumvalcol], pipesarr[TEpatterncol], pipesarr[TEtensecol], pipesarr[TErelrefcol]);
outfile.write(pipesline + "|" + isofr + "\n");
lastTempexReference = isofr;
break;
case ISOFA:
String isofa = timek.obtainImplicitDate(pipesarr[TEnumvalcol], pipesarr[TEpatterncol], pipesarr[TEtensecol], pipesarr[TEdctcol]);
outfile.write(pipesline + "|" + isofa + "\n");
lastTempexReference = isofa;
break;
case ISOSET:
String set = timek.obtainISOSet(pipesarr[TEnumvalcol], pipesarr[TEpatterncol]);
outfile.write(pipesline + "|" + set + "\n");
break;
case PRESENT_REF:
case PAST_REF:
case FUTURE_REF:
if (TEnormtype.equals("PRESENT_REF")) {
lastTempexReference = pipesarr[TEdctcol];
}
outfile.write(pipesline + "|" + TEnormtype + "\n");
break;
}
}
}
} finally {
if (pipesreader != null) {
pipesreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TIMEN):\n\t" + e.toString() + " (Line " + linen + ")\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
/** Stupid baseline REMOVE**/
public static String get_normalized_values_baseline(String timenf) {
String output;
PipesFile nlpfile = new PipesFile(timenf);
((PipesFile) nlpfile).isWellFormedOptimist();
output = getNormalizedValuesBaseline((PipesFile) nlpfile);
return output;
}
public static String getNormalizedValuesBaseline(PipesFile pipesfile) {
String outputfile = null;
int linen = 0;
try {
outputfile = pipesfile.getFile().getCanonicalPath() + "-normalized_values";
BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
int TEdctcol = pipesfile.getColumn("DCT");
String pipesline = null;
String[] pipesarr = null;
try {
while ((pipesline = pipesreader.readLine()) != null) {
pipesarr = pipesline.split("\\|");
linen++;
if (pipesarr.length >= pipesfile.getPipesDescArrCount()) {
outfile.write(pipesline + "|" + pipesarr[TEdctcol] + "\n");
}
}
} finally {
if (pipesreader != null) {
pipesreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TIMEN):\n\t" + e.toString() + " (Line " + linen + ")\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
public static String get_key_normalized_values(String timenf) {
String output = timenf + "-key";
int linen = 0;
try {
BufferedWriter outfile = new BufferedWriter(new FileWriter(output));
BufferedReader pipesreader = new BufferedReader(new FileReader(timenf));
PipesFile nlpfile = new PipesFile(timenf);
((PipesFile) nlpfile).isWellFormedOptimist();
int valuecol=nlpfile.getColumn("value");
try {
String pipesline;
String[] pipesarr = null;
while ((pipesline = pipesreader.readLine()) != null) {
linen++;
pipesarr = pipesline.split("\\|");
outfile.write(pipesline + "|" + pipesarr[valuecol] + "\n");
}
} finally {
if (pipesreader != null) {
pipesreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TempEval-Experimenter):\n\t" + e.toString() + " - line:" + linen + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return output;
}
}