package com.cognitionis.nlp_files;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import com.cognitionis.utils_basickit.FileUtils;
import com.cognitionis.utils_basickit.XmlAttribs;
/**
*
* @author Héctor Llorens
* @since 2011
*/
public class TempEvalFiles {
/**
 * Splits the corpus that {@code file} belongs to into {@code folds} train/test partitions.
 *
 * The data is first gathered by {@link #create_data2fold(File, boolean)}, the lines of the
 * resulting base file are counted, fold boundaries are computed by
 * {@link #get_file_markers(File, int, int, int)} and finally
 * {@link #create_folded_data(String, String, ArrayList, int, String)} is called, for every fold,
 * on the base file and on the six fixed companion files.
 *
 * Errors are reported on stderr; when the system property DEBUG is "true" the stack trace is
 * printed and the JVM exits with status 1.
 *
 * @param file        base file of the corpus (its name is reused inside data2fold)
 * @param folds       number of folds to create
 * @param includetest forwarded to {@code create_data2fold}
 */
public static void divide_nfolds(File file, int folds, boolean includetest) {
    try {
        // create folder data2fold
        file = create_data2fold(file, includetest);
        if (file == null) {
            // create_data2fold already reported the cause; stop instead of failing with a bare NPE
            throw new IOException("data2fold could not be created");
        }
        int numlines;
        LineNumberReader lnr = new LineNumberReader(new FileReader(file));
        try {
            // skipping to the end makes the reader count every line
            lnr.skip(Long.MAX_VALUE);
            numlines = lnr.getLineNumber();
        } finally {
            lnr.close();
        }
        int lines_per_fold = numlines / folds;
        int lines_margin = lines_per_fold / 5; // 20%
        System.err.println("Number of Lines: " + numlines);
        // build the fold boundaries from the base file
        ArrayList<String[]> file_markers = get_file_markers(file, folds, lines_per_fold, lines_margin);
        if (file_markers == null) {
            // without markers every create_folded_data call below would fail separately
            throw new IOException("fold markers could not be computed for " + file);
        }
        // build the folds for each file given the lines per fold
        File dir = new File((new File(file.getAbsolutePath())).getParent());
        String parent_path = dir.getParent();
        for (int i = 0; i < folds; i++) {
            create_folded_data(file.getName(), parent_path, file_markers, i, "\t");
            create_folded_data("dct.tab", parent_path, file_markers, i, "\t");
            create_folded_data("timex-extents.tab", parent_path, file_markers, i, "\t");
            create_folded_data("timex-attributes.tab", parent_path, file_markers, i, "\t");
            create_folded_data("event-extents.tab", parent_path, file_markers, i, "\t");
            create_folded_data("event-attributes.tab", parent_path, file_markers, i, "\t");
            create_folded_data("base-segmentation.TempEval2-features", parent_path, file_markers, i, "\\|");
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
    }
}
/**
 * Gathers the corpus files into a sibling directory called {@code data2fold}.
 *
 * The directory is created next to the directory that contains {@code file}. If a copy of
 * {@code file} is already there, it and the six companion files are deleted first. Then
 * {@code file} and the companion files found beside it are copied in. When the source directory
 * name ends with "train", {@code includetest} is set and a sibling "test" directory exists, the
 * files of that directory are appended to the copies.
 *
 * @param file        base file of the corpus
 * @param includetest whether the sibling "test" directory should be appended
 * @return the copy of {@code file} inside data2fold, or {@code null} if anything failed
 */
public static File create_data2fold(File file, boolean includetest) {
    File result = null;
    try {
        String[] companions = {
            "dct.tab",
            "timex-extents.tab",
            "timex-attributes.tab",
            "event-extents.tab",
            "event-attributes.tab",
            "base-segmentation.TempEval2-features"
        };
        File sourceDir = new File((new File(file.getAbsolutePath())).getParent());
        String corpusRoot = sourceDir.getParent();
        File foldDir = new File(corpusRoot + File.separator + "data2fold");
        if (!(foldDir.exists() && foldDir.isDirectory())) {
            foldDir.mkdir();
        }
        String foldPrefix = foldDir + File.separator;
        String sourcePrefix = sourceDir + File.separator;

        // a previous run left its files behind: remove them before copying again
        if ((new File(foldPrefix + file.getName())).exists()) {
            (new File(foldPrefix + file.getName())).delete();
            for (int k = 0; k < companions.length; k++) {
                (new File(foldPrefix + companions[k])).delete();
            }
        }

        // copy the base file and its companions
        FileUtils.copyFileUtil(file, new File(foldPrefix + file.getName()));
        for (int k = 0; k < companions.length; k++) {
            FileUtils.copyFileUtil(new File(sourcePrefix + companions[k]), new File(foldPrefix + companions[k]));
        }

        // optionally append the test data to the copies
        boolean appendTest = sourceDir.getName().endsWith("train")
                && includetest
                && (new File(corpusRoot + File.separator + "test")).exists();
        if (appendTest) {
            String testPrefix = corpusRoot + File.separator + "test" + File.separator;
            FileUtils.copyFileUtilappend(new File(testPrefix + file.getName()), new File(foldPrefix + file.getName()));
            for (int k = 0; k < companions.length; k++) {
                FileUtils.copyFileUtilappend(new File(testPrefix + companions[k]), new File(foldPrefix + companions[k]));
            }
        }
        result = new File(foldPrefix + file.getName());
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return result;
}
/**
 * Computes one marker per fold from a tab-separated file whose first column is a file id.
 *
 * Each marker is a two-element array: {@code [0]} a file id and {@code [1]} a line number
 * (as a string). While reading, a candidate marker is recorded whenever the first column
 * differs from the remembered file id and the line number has reached
 * {@code lines_per_fold * current_fold - lines_margin}; the candidate is committed once the
 * line number reaches {@code lines_per_fold * current_fold}. The last fold always ends at the
 * last line of the file.
 *
 * @param file           tab-separated input file
 * @param folds          number of folds
 * @param lines_per_fold target number of lines per fold (must not be 0)
 * @param lines_margin   number of lines before the target at which a cut is already accepted
 * @return the list of markers, or {@code null} if the file is empty, unreadable or any other
 *         error occurs (the error is reported on stderr)
 */
public static ArrayList<String[]> get_file_markers(File file, int folds, int lines_per_fold, int lines_margin) {
    ArrayList<String[]> file_markers = new ArrayList<String[]>();
    int linen = 0;
    try {
        if (lines_per_fold == 0) {
            // the percentage shown below divides by lines_per_fold; fail with a clear message instead
            throw new IllegalArgumentException("lines_per_fold is 0 (fewer lines than folds?)");
        }
        String[] current_filemarker = new String[2];
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            String[] tabarr = null;
            String current_fileid = "";
            int current_fold = 1;
            while ((line = reader.readLine()) != null) {
                linen++;
                tabarr = line.split("\t");
                // save the file/line possibility
                // NOTE(review): current_fileid is only refreshed inside this branch, so it is not
                // necessarily the id of the previous line - confirm this is intended.
                if (!current_fileid.equals(tabarr[0]) && linen >= (lines_per_fold * current_fold - lines_margin)) {
                    current_filemarker = new String[2];
                    current_filemarker[0] = current_fileid;
                    current_fileid = tabarr[0];
                    current_filemarker[1] = "" + (linen - 1);
                }
                if (linen >= (lines_per_fold * current_fold) && current_fold != folds) {
                    int foldlines = Integer.parseInt(current_filemarker[1]);
                    if (current_fold > 1) {
                        foldlines -= Integer.parseInt(file_markers.get(current_fold - 2)[1]);
                    }
                    System.err.println("Fold: " + current_fold + "/" + folds + " file: " + current_filemarker[0] + " line: " + current_filemarker[1] + " lines: " + foldlines + "/" + lines_per_fold + " (" + (foldlines * 100) / lines_per_fold + "%)");
                    file_markers.add(current_filemarker);
                    current_fold++;
                }
            }
            if (tabarr == null) {
                // no line was read at all: there is nothing to mark
                throw new IOException("Empty file: " + file);
            }
            // the last fold ends with the last line of the file
            current_filemarker = new String[2];
            current_filemarker[0] = tabarr[0];
            current_filemarker[1] = "" + (linen);
            int foldlines = linen;
            if (current_fold > 1) {
                // only subtract the previous marker when one exists (a single fold has none)
                foldlines -= Integer.parseInt(file_markers.get(current_fold - 2)[1]);
            }
            System.err.println("Fold: " + current_fold + "/" + folds + " file: " + current_filemarker[0] + " line: " + current_filemarker[1] + " lines: " + foldlines + "/" + lines_per_fold + " (" + (foldlines * 100) / lines_per_fold + "%)");
            file_markers.add(current_filemarker);
        } finally {
            reader.close();
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + " - line:" + linen + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return file_markers;
}
/**
 * Writes the train and test parts of one file for fold {@code i}.
 *
 * The input is {@code parent_path/data2fold/file}; the output goes to
 * {@code parent_path/train(i+1)/file} and {@code parent_path/test(i+1)/file} (both directories
 * are created when missing). Each line is split with {@code separator} and its first column is
 * compared with two markers: lines go to the test file from the first line whose id equals
 * {@code file_markers.get(i - 1)[0]} (or from the very first line when {@code i == 0}) until the
 * id stops being {@code file_markers.get(i)[0]} after having matched it; every other line goes
 * to the train file.
 *
 * Errors are reported on stderr; when the system property DEBUG is "true" the stack trace is
 * printed and the JVM exits with status 1.
 *
 * @param file         name of the file inside data2fold
 * @param parent_path  directory that contains data2fold
 * @param file_markers markers as produced by {@code get_file_markers}
 * @param i            zero-based fold index
 * @param separator    regular expression used to split each line
 */
public static void create_folded_data(String file, String parent_path, ArrayList<String[]> file_markers, int i, String separator) {
    boolean intrain = true;
    try {
        File dirtrain = new File(parent_path + File.separator + "train" + (i + 1));
        if (!dirtrain.exists() || !dirtrain.isDirectory()) {
            dirtrain.mkdir();
        }
        File dirtest = new File(parent_path + File.separator + "test" + (i + 1));
        if (!dirtest.exists() || !dirtest.isDirectory()) {
            dirtest.mkdir();
        }
        String firsttestfilemarker = "";
        String lasttestfilemarker = file_markers.get(i)[0];
        if (i != 0) {
            firsttestfilemarker = file_markers.get(i - 1)[0];
        }
        // opened inside the try so that a failure while opening a later stream
        // still closes the ones that were already opened
        BufferedReader reader = null;
        BufferedWriter outtrain = null;
        BufferedWriter outtest = null;
        try {
            reader = new BufferedReader(new FileReader(parent_path + File.separator + "data2fold" + File.separator + file));
            outtrain = new BufferedWriter(new FileWriter(dirtrain + File.separator + file));
            outtest = new BufferedWriter(new FileWriter(dirtest + File.separator + file));
            String line;
            String[] tabarr = null;
            boolean lastfile = false;
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                System.err.println(dirtrain + " " + dirtest + " " + (i + 1) + " 1-" + firsttestfilemarker + " 2-" + lasttestfilemarker);
            }
            while ((line = reader.readLine()) != null) {
                tabarr = line.split(separator);
                if (firsttestfilemarker.equals("")) {
                    // first fold: the test part starts with the first file
                    firsttestfilemarker = tabarr[0];
                }
                if (firsttestfilemarker.equals(tabarr[0])) {
                    intrain = false;
                }
                // check lastfile
                if (lasttestfilemarker.equals(tabarr[0])) {
                    lastfile = true;
                }
                if (!lasttestfilemarker.equals(tabarr[0]) && lastfile) {
                    // the last test file is over: back to train
                    lastfile = false;
                    intrain = true;
                }
                if (intrain) {
                    outtrain.write(line + "\n");
                } else {
                    outtest.write(line + "\n");
                }
            }
        } finally {
            // close everything that was opened, even if one close() fails
            IOException closeError = null;
            for (Closeable c : new Closeable[]{reader, outtrain, outtest}) {
                if (c != null) {
                    try {
                        c.close();
                    } catch (IOException ex) {
                        if (closeError == null) {
                            closeError = ex;
                        }
                    }
                }
            }
            if (closeError != null) {
                throw closeError;
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
    }
}
/**
 * Convenience entry point for {@link #merge_extents(PipesFile, String)} that starts from paths.
 *
 * The extents tab file is converted through {@code TabFile.getPipesFile()} and renamed with
 * {@code FileUtils.renameTo} (pattern "-extents\.tab\.pipes" to "\.TempEval-extents"); the
 * features file is wrapped in a {@code PipesFile}; after merging, the renamed temporary file is
 * deleted.
 *
 * @param features   path of the features (pipes) file
 * @param extentstab path of the extents tab file
 * @param elem       element name forwarded to the merge
 * @return whatever {@code merge_extents(PipesFile, String)} returns
 */
public static String merge_extents(String features, String extentstab, String elem) {
    TabFile extentsTab = new TabFile(extentstab);
    extentsTab.isWellFormatted();
    String pipesVersion = extentsTab.getPipesFile();
    String temporaryExtents = FileUtils.renameTo(pipesVersion, "-extents\\.tab\\.pipes", "\\.TempEval-extents");

    PipesFile featuresPipes = new PipesFile(features);
    featuresPipes.isWellFormedOptimist();

    String merged = merge_extents(featuresPipes, elem);
    // the renamed extents file was only needed during the merge
    (new File(temporaryExtents)).delete();
    return merged;
}
/**
 * Appends an IOB2 annotation column to every line of a pipes file.
 *
 * Extents are read, in order, from {@code <parent>/<elemext>.TempEval-extents} (pipe-separated).
 * A token line matches the pending extent when their first three columns are equal. A matching
 * line gets "|B-" + extent column 3 when extent column 4 (the id) differs from the id of the
 * previous match, and "|I-" + extent column 3 otherwise; any other line gets "|O".
 * The result is written to the canonical path of the pipes file plus
 * "-annotationKey-" + {@code elemext}.
 *
 * @param pipesfile token file to annotate
 * @param elemext   element name, used for the extents file name and the output suffix
 * @return the path of the written file, or {@code null} on error (reported on stderr; with the
 *         system property DEBUG set to "true" the JVM exits with status 1)
 */
public static String merge_extents(PipesFile pipesfile, String elemext) {
    String outputfile = null;
    try {
        outputfile = pipesfile.getFile().getCanonicalPath() + "-annotationKey-" + elemext;
        String extentsfile = pipesfile.getFile().getParent() + "/" + elemext + ".TempEval-extents";
        PipesFile keypipes = new PipesFile(extentsfile);
        keypipes.isWellFormedOptimist();
        // opened inside the try so that a failure while opening a later stream
        // still closes the ones that were already opened
        BufferedWriter outfile = null;
        BufferedReader extentsreader = null;
        BufferedReader pipesreader = null;
        try {
            outfile = new BufferedWriter(new FileWriter(outputfile));
            extentsreader = new BufferedReader(new FileReader(extentsfile));
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            String extentline;
            String[] extentarr = null;
            String pipesline;
            String[] pipesarr = null;
            String extentId = ""; // id of the previously matched extent
            String curr_fileid = "";
            while ((pipesline = pipesreader.readLine()) != null) {
                pipesarr = pipesline.split("\\|");
                // fetch the next extent only when the previous one has been consumed
                if (extentarr == null && (extentline = extentsreader.readLine()) != null) {
                    extentarr = extentline.split("\\|");
                    // to avoid joining t1 at the end of a file and t1 at the beginning of the next
                    if (!curr_fileid.equals(extentarr[0])) {
                        extentId = "";
                        curr_fileid = extentarr[0];
                    }
                }
                boolean tokenMatches = extentarr != null
                        && pipesarr[0].equals(extentarr[0])
                        && pipesarr[1].equals(extentarr[1])
                        && pipesarr[2].equals(extentarr[2]);
                if (tokenMatches) {
                    // same id as the previous match continues the element, a new id begins one
                    String tag = extentId.equals(extentarr[4]) ? "|I-" : "|B-";
                    outfile.write(pipesline + tag + extentarr[3] + "\n");
                    extentId = extentarr[4];
                    extentarr = null;
                } else {
                    outfile.write(pipesline + "|O\n");
                    extentId = "";
                }
            }
        } finally {
            // close everything that was opened, even if one close() fails
            IOException closeError = null;
            for (Closeable c : new Closeable[]{pipesreader, extentsreader, outfile}) {
                if (c != null) {
                    try {
                        c.close();
                    } catch (IOException ex) {
                        if (closeError == null) {
                            closeError = ex;
                        }
                    }
                }
            }
            if (closeError != null) {
                throw closeError;
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Convenience entry point for {@link #merge_extents_class(PipesFile, String)} that starts from
 * paths.
 *
 * The extents tab file is converted through {@code TabFile.getPipesFile()} and renamed with
 * {@code FileUtils.renameTo} (pattern "-extents\.tab\.pipes" to "\.TempEval-extents"); the
 * features file is wrapped in a {@code PipesFile}; after merging, the renamed temporary file is
 * deleted.
 *
 * @param features   path of the features (pipes) file
 * @param extentstab path of the extents tab file
 * @param elem       element name forwarded to the merge
 * @return whatever {@code merge_extents_class(PipesFile, String)} returns
 */
public static String merge_extents_class(String features, String extentstab, String elem) {
    TabFile extentsTab = new TabFile(extentstab);
    extentsTab.isWellFormatted();
    String pipesVersion = extentsTab.getPipesFile();
    String temporaryExtents = FileUtils.renameTo(pipesVersion, "-extents\\.tab\\.pipes", "\\.TempEval-extents");

    PipesFile featuresPipes = new PipesFile(features);
    featuresPipes.isWellFormedOptimist();

    String merged = merge_extents_class(featuresPipes, elem);
    // the renamed extents file was only needed during the merge
    (new File(temporaryExtents)).delete();
    return merged;
}
/**
 * Appends an IOB2 annotation column that also carries a class/type value.
 *
 * Works like the plain extents merge: extents come from
 * {@code <parent>/<elemext>.TempEval-extents} (pipe-separated) and a token line matches the
 * pending extent when their first three columns are equal. In addition, every time a new
 * element begins (extent column 4 differs from the previous id) the attributes file
 * {@code <parent>/<elemext>-attributes.tab} (tab-separated) is read forward until a line whose
 * column 6 matches "(?i)(class|type)"; its column 7 becomes the class. Matching lines get
 * "|B-" or "|I-" + extent column 3 + "-" + class, all other lines get "|O".
 * The result is written to the canonical path of the pipes file plus
 * "-annotationKey-" + {@code elemext} + "-class".
 *
 * NOTE(review): the attributes lines are consumed strictly in order and are not compared with
 * the extent they are assigned to - presumably both files are expected to be aligned; confirm.
 *
 * @param pipesfile token file to annotate
 * @param elemext   element name, used for the input file names and the output suffix
 * @return the path of the written file, or {@code null} on error (reported on stderr; with the
 *         system property DEBUG set to "true" the JVM exits with status 1)
 */
public static String merge_extents_class(PipesFile pipesfile, String elemext) {
String outputfile = null;
try {
outputfile = pipesfile.getFile().getCanonicalPath() + "-annotationKey-" + elemext + "-class";
//String extentsfile = pipesfile.getFile().getCanonicalPath().substring(0, pipesfile.getFile().getCanonicalPath().indexOf(".")) + "." + elemext + ".TempEvalFiles-extents";
String extentsfile = pipesfile.getFile().getParent() + "/" + elemext + ".TempEval-extents";
String attrfile = pipesfile.getFile().getParent() + "/" + elemext + "-attributes.tab";
/*PipesFile attrpipes = new PipesFile();
attrpipes.loadFile(new File(attrfile));
attrpipes.isWellFormedOptimist();*/
PipesFile keypipes = new PipesFile(extentsfile);
keypipes.isWellFormedOptimist();
// NOTE(review): the streams are opened before the try/finally below, so a failure while
// opening a later one leaves the earlier ones unclosed.
BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
BufferedReader extentsreader = new BufferedReader(new FileReader(extentsfile));
BufferedReader attrsreader = new BufferedReader(new FileReader(attrfile));
BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
try {
String extentline;
String[] extentarr = null;
String attrline;
String[] attrarr = null;
// class of the element currently being written; kept for its I- lines
String attrclass = "";
String pipesline;
String[] pipesarr = null;
String extentId = ""; // id of the previously matched extent
String curr_fileid = "";
while ((pipesline = pipesreader.readLine()) != null) {
pipesarr = pipesline.split("\\|");
// fetch the next extent only when the previous one has been consumed
if (extentarr == null && (extentline = extentsreader.readLine()) != null) {
extentarr = extentline.split("\\|");
// to avoid joining t1 at the end of a file and t1 at the beginning of the next
if (!curr_fileid.equals(extentarr[0])) {
extentId = "";
curr_fileid = extentarr[0];
}
}
if (extentarr != null) {
// the token belongs to the extent when file, sentence and token columns agree
if (pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) {
if (!extentId.equals(extentarr[4])) {
// new element: advance the attributes file to the next class/type line
// (if none is found before the end of the file, the previous class is kept)
while ((attrline = attrsreader.readLine()) != null) {
//System.out.println(attrline);
attrarr = attrline.split("\t");
if (attrarr[6].matches("(?i)(class|type)")) {
attrclass = attrarr[7];
break;
}
}
outfile.write(pipesline + "|B-" + extentarr[3] + "-" + attrclass + "\n");
} else {
// same id as the previous token: continuation of the element
outfile.write(pipesline + "|I-" + extentarr[3] + "-" + attrclass + "\n");
}
extentId = extentarr[4];
extentarr = null;
} else {
outfile.write(pipesline + "|O\n");
extentId = "";
}
} else {
// no extents left
outfile.write(pipesline + "|O\n");
extentId = "";
}
}
} finally {
if (pipesreader != null) {
pipesreader.close();
}
if (extentsreader != null) {
extentsreader.close();
}
if (attrsreader != null) {
attrsreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
/**
 * Moves the third "-"-separated part of the element(IOB2) column into a new attribute column.
 *
 * For every line of the pipes file: when the element(IOB2) column is "O" the line is written
 * followed by "|-". Otherwise the line is written up to (not including) its last "-", followed
 * by "|" and then {@code attrib + "=" + value} for B lines or "-" for the rest. The value is the
 * third "-"-separated part of the element column, or "EMPTY" when there is none.
 * The result is written to the canonical path of the pipes file plus "-attribs-" + attrib.
 *
 * NOTE(review): the line is cut at its last "-" even when the element column has fewer than
 * three parts, and the cut assumes the element column is the last one - confirm with callers.
 *
 * @param pipesfile pipes file that has an element(IOB2) column
 * @param attrib    name of the attribute to emit
 * @return the path of the written file, or {@code null} on error (reported on stderr; with the
 *         system property DEBUG set to "true" the JVM exits with status 1)
 */
public static String split_extents_attrib(PipesFile pipesfile, String attrib) {
    String outputfile = null;
    try {
        outputfile = pipesfile.getFile().getCanonicalPath() + "-attribs-" + attrib;
        // opened inside the try so that a failure while opening the reader
        // still closes the writer that was already opened
        BufferedWriter outfile = null;
        BufferedReader pipesreader = null;
        try {
            outfile = new BufferedWriter(new FileWriter(outputfile));
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            String pipesline;
            String[] pipesarr = null;
            int elembiocol = pipesfile.getColumn("element\\(IOB2\\)");
            while ((pipesline = pipesreader.readLine()) != null) {
                pipesarr = pipesline.split("\\|");
                if (pipesarr[elembiocol].equalsIgnoreCase("O")) {
                    outfile.write(pipesline + "|-\n");
                } else {
                    String attribval = "EMPTY";
                    String[] elemsplit = pipesarr[elembiocol].split("-");
                    if (elemsplit.length >= 3) {
                        attribval = elemsplit[2];
                    }
                    outfile.write(pipesline.substring(0, pipesline.lastIndexOf("-")) + "|");
                    // only the first token of an element carries the attribute
                    if (elemsplit[0].equalsIgnoreCase("B")) {
                        outfile.write(attrib + "=" + attribval + "\n");
                    } else {
                        outfile.write("-\n");
                    }
                }
            }
        } finally {
            // close everything that was opened, even if one close() fails
            IOException closeError = null;
            for (Closeable c : new Closeable[]{pipesreader, outfile}) {
                if (c != null) {
                    try {
                        c.close();
                    } catch (IOException ex) {
                        if (closeError == null) {
                            closeError = ex;
                        }
                    }
                }
            }
            if (closeError != null) {
                throw closeError;
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Convenience entry point for {@link #merge_attribs(PipesFile, String)} that starts from paths.
 *
 * The attributes tab file is converted through {@code TabFile.getPipesFile()} and renamed with
 * {@code FileUtils.renameTo} (pattern "-attributes\.tab\.pipes" to "\.TempEval-attributes");
 * the features-and-extents file is wrapped in a {@code PipesFile}; after merging, the renamed
 * temporary file is deleted.
 *
 * @param features_and_extents path of the pipes file that already has the extents column
 * @param attribstab           path of the attributes tab file
 * @param elem                 element name forwarded to the merge
 * @return whatever {@code merge_attribs(PipesFile, String)} returns
 */
public static String merge_attribs(String features_and_extents, String attribstab, String elem) {
    TabFile attribsTab = new TabFile(attribstab);
    attribsTab.isWellFormatted();
    String pipesVersion = attribsTab.getPipesFile();
    String temporaryAttribs = FileUtils.renameTo(pipesVersion, "-attributes\\.tab\\.pipes", "\\.TempEval-attributes");

    PipesFile annotatedPipes = new PipesFile(features_and_extents);
    annotatedPipes.isWellFormedOptimist();

    String merged = merge_attribs(annotatedPipes, elem);
    // the renamed attributes file was only needed during the merge
    (new File(temporaryAttribs)).delete();
    return merged;
}
/**
 * Appends an attributes column to a pipes file that already has an element(IOB2) column.
 *
 * Attributes are read, in order, from {@code <parent>/<elemattr>.TempEval-attributes}
 * (pipe-separated). A token line matches the pending attribute when their first three columns
 * are equal; its element column must then be "B-" + attribute column 3. The line is written
 * followed by {@code |<x>id="<col4>" <col6>="<col7>"} (x being the lower-cased first letter of
 * {@code elemattr}) and by one more {@code <col6>="<col7>"} pair for every following attribute
 * line with the same id and file id. Every other line gets "|-"; if such a line has
 * "B-" + elemattr in its element column an error is raised.
 * The result is written to the canonical path of the pipes file plus "-attribs".
 *
 * @param pipesfile pipes file with the element(IOB2) column
 * @param elemattr  element name, used for the attributes file name and the id prefix
 * @return the path of the written file, or {@code null} on error (reported on stderr together
 *         with the line number; with the system property DEBUG set to "true" the JVM exits
 *         with status 1)
 */
public static String merge_attribs(PipesFile pipesfile, String elemattr) {
    String outputfile = null;
    int linen = 0;
    try {
        outputfile = pipesfile.getFile().getCanonicalPath() + "-attribs";
        String attrfile = pipesfile.getFile().getParent() + "/" + elemattr + ".TempEval-attributes";
        PipesFile attrpipes = new PipesFile(attrfile);
        attrpipes.isWellFormedOptimist();
        // opened inside the try so that a failure while opening a later stream
        // (or in getColumn) still closes the ones that were already opened
        BufferedWriter outfile = null;
        BufferedReader extentsreader = null;
        BufferedReader pipesreader = null;
        try {
            outfile = new BufferedWriter(new FileWriter(outputfile));
            extentsreader = new BufferedReader(new FileReader(attrfile));
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            int elemcol = pipesfile.getColumn("element\\(IOB2\\)");
            String attrline = "";
            String[] attrarr = null;
            String pipesline;
            String[] pipesarr = null;
            String attrId = ""; // id of the pending attribute
            String curr_fileid = "";
            while ((pipesline = pipesreader.readLine()) != null) {
                linen++;
                pipesarr = pipesline.split("\\|");
                // clamp to the last column; ">=" because an index equal to the length is
                // already out of range (the previous ">" let that case through)
                if (elemcol >= pipesarr.length) {
                    elemcol = pipesarr.length - 1;
                }
                // fetch the next attribute only when the previous one has been consumed
                if (attrarr == null && (attrline = extentsreader.readLine()) != null) {
                    attrarr = attrline.split("\\|");
                    attrId = attrarr[4];
                    curr_fileid = attrarr[0];
                }
                boolean tokenMatches = attrarr != null
                        && pipesarr[0].equals(attrarr[0])
                        && pipesarr[1].equals(attrarr[1])
                        && pipesarr[2].equals(attrarr[2]);
                if (tokenMatches) {
                    if (!pipesarr[elemcol].equals("B-" + attrarr[3])) {
                        throw new Exception("Malformed TempEval attribs file (B-element not found for attribs)\n" + pipesline + "\n" + attrline);
                    }
                    outfile.write(pipesline + "|" + elemattr.substring(0, 1).toLowerCase() + "id=\"" + attrarr[4] + "\" " + attrarr[6] + "=\"" + attrarr[7] + "\"");
                    // collect the remaining attributes of the same element; the first line that
                    // belongs to another element stays pending in attrarr
                    while ((attrline = extentsreader.readLine()) != null) {
                        attrarr = attrline.split("\\|");
                        if (attrId.equals(attrarr[4]) && curr_fileid.equals(attrarr[0])) {
                            outfile.write(" " + attrarr[6] + "=\"" + attrarr[7] + "\"");
                            attrarr = null;
                        } else {
                            attrId = attrarr[4];
                            curr_fileid = attrarr[0];
                            break;
                        }
                    }
                    outfile.write("\n");
                } else {
                    if (pipesarr[elemcol].equals("B-" + elemattr)) {
                        throw new Exception("Malformed TempEval attribs file (B-" + elemattr + " found with no attribs)\n" + pipesline + "\n" + attrline);
                    }
                    outfile.write(pipesline + "|-\n");
                }
            }
        } finally {
            // close everything that was opened, even if one close() fails
            IOException closeError = null;
            for (Closeable c : new Closeable[]{pipesreader, extentsreader, outfile}) {
                if (c != null) {
                    try {
                        c.close();
                    } catch (IOException ex) {
                        if (closeError == null) {
                            closeError = ex;
                        }
                    }
                }
            }
            if (closeError != null) {
                throw closeError;
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + " - line:" + linen + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Appends, in one pass, an IOB2 column and an attributes column to a pipes file.
 *
 * Extents are read from {@code <parent>/<elem>-extents.tab} and attributes from
 * {@code <parent>/<elem>-attributes.tab}, both tab-separated and both consumed in order.
 * When an extent with a new id is fetched, the pending (or next) attribute line must agree
 * with it on columns 0 to 4, otherwise an error is raised. A token line matches the pending
 * extent when their first three columns are equal:
 * <ul>
 * <li>first token of an element: {@code |B-<col3>|<x>id="<col4>" <col6>="<col7>"} followed by
 *     one {@code <col6>="<col7>"} pair per further attribute line of the same id and file id
 *     (x is the lower-cased first letter of {@code elem});</li>
 * <li>further tokens of the same element: {@code |I-<col3>|-};</li>
 * <li>any other line: {@code |O|-}.</li>
 * </ul>
 * After the last token, leftover extents or attributes raise an error.
 * The result is written to the canonical path of the pipes file plus
 * "-annotationKey-" + elem + "-attribs".
 *
 * @param pipesfile token file to annotate
 * @param elem      element name, used for the input file names, the id prefix and the suffix
 * @return the path of the written file, or {@code null} on error (reported on stderr together
 *         with the line number; with the system property DEBUG set to "true" the JVM exits
 *         with status 1)
 */
public static String merge_extents_and_attribs(PipesFile pipesfile, String elem) {
String outputfile = null;
int linen = 0;
try {
outputfile = pipesfile.getFile().getCanonicalPath() + "-annotationKey-" + elem + "-attribs";
String extentsfile = pipesfile.getFile().getParent() + "/" + elem + "-extents.tab";
String attrsfile = pipesfile.getFile().getParent() + "/" + elem + "-attributes.tab";
// NOTE(review): the streams are opened before the try/finally below, so a failure while
// opening a later one leaves the earlier ones unclosed.
BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
BufferedReader extentsreader = new BufferedReader(new FileReader(extentsfile));
BufferedReader attrsreader = new BufferedReader(new FileReader(attrsfile));
try {
String pipesline;
String[] pipesarr = null;
String extentline = "";
String[] extentarr = null;
String extentId = ""; // id of the previously matched extent
String attrline = "";
String[] attrarr = null;
String attrId = ""; // id of the pending attribute
String curr_fileid = "";
while ((pipesline = pipesreader.readLine()) != null) {
linen++;
//System.out.println(pipesline);
pipesarr = pipesline.split("\\|");
// fetch the next extent only when the previous one has been consumed
if (extentarr == null && (extentline = extentsreader.readLine()) != null) {
extentarr = extentline.split("\t");
//System.out.println(pipesline+" "+extentline);
// to avoid joining t1 at the end of a file and t1 at the beginning of the next
if (!curr_fileid.equals(extentarr[0])) {
extentId = ""; // new id
curr_fileid = extentarr[0];
}
// only in B-elements
if (!extentId.equals(extentarr[4])) { // new id
if (attrarr != null) {
// an attribute line is already pending (read ahead by the loop further down)
if (!extentarr[0].equals(attrarr[0]) || !extentarr[1].equals(attrarr[1]) || !extentarr[2].equals(attrarr[2]) || !extentarr[3].equals(attrarr[3]) || !extentarr[4].equals(attrarr[4])) {
throw new Exception("Extents-Attributes incongruence:\n\t" + extentline + "\n\t" + attrline);
}
} else {
if ((attrline = attrsreader.readLine()) == null) {
throw new Exception("Attributes for extents (" + extentline + ") missing");
} else {
attrarr = attrline.split("\t");
attrId = attrarr[4];
// cross-check
if (!extentarr[0].equals(attrarr[0]) || !extentarr[1].equals(attrarr[1]) || !extentarr[2].equals(attrarr[2]) || !extentarr[3].equals(attrarr[3]) || !extentarr[4].equals(attrarr[4])) {
throw new Exception("Extents-Attributes incongruence:\n\t" + extentline + "\n\t" + attrline);
}
}
}
}
}
if (extentarr != null) {
// the token belongs to the extent when file, sentence and token columns agree
if (pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) {
//System.out.println(pipesline+" "+extentline);
if (!extentId.equals(extentarr[4])) {
// first token of the element: write the tag, the id and the first attribute
outfile.write(pipesline + "|B-" + extentarr[3] + "|" + elem.substring(0, 1).toLowerCase() + "id=\"" + attrarr[4] + "\" " + attrarr[6] + "=\"" + attrarr[7] + "\"");
attrarr = null; // nullify scheme
// append the remaining attributes of this element; the first line of another
// element stays pending in attrarr for the cross-check above
while ((attrline = attrsreader.readLine()) != null) {
attrarr = attrline.split("\t");
if (attrarr[7].trim().equals("")) {
throw new Exception("Empty attribute: " + attrline);
}
//System.out.println(attrId+" "+attrarr[4]+" "+curr_fileid+" "+attrarr[0]);
if (attrId.equals(attrarr[4]) && curr_fileid.equals(attrarr[0])) {
outfile.write(" " + attrarr[6] + "=\"" + attrarr[7] + "\"");
attrarr = null;
} else {
attrId = attrarr[4];
break;
}
}
outfile.write("\n");
} else {
// same id as the previous token: continuation of the element
outfile.write(pipesline + "|I-" + extentarr[3] + "|-\n");
}
extentId = extentarr[4];
extentarr = null;
} else {
outfile.write(pipesline + "|O|-\n");
extentId = "";
}
} else {
// no extents left
outfile.write(pipesline + "|O|-\n");
extentId = "";
}
}
// everything must have been consumed by now
if (extentarr != null) {
throw new Exception("Extents found without tokens correspondence: " + extentline + " " + elem);
}
// NOTE(review): this message prints extentline, not attrline - confirm that is intended
if (attrarr != null) {
throw new Exception("Attributes found without tokens correspondence: " + extentline + " " + elem);
}
if ((extentline = extentsreader.readLine()) != null) {
throw new Exception("Some extents not assigned (" + extentline + ") " + elem);
}
if ((attrline = attrsreader.readLine()) != null) {
throw new Exception("Some attributes not assigned (" + attrline + ") " + elem);
}
} finally {
if (pipesreader != null) {
pipesreader.close();
}
if (extentsreader != null) {
extentsreader.close();
}
if (attrsreader != null) {
attrsreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TempEval):\n\t" + e.toString() + " - line:" + linen + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
/**
 * Writes filtered copies of the extents and attributes files of an element.
 *
 * Reads {@code <parent>/<elem>-extents.tab} and {@code <parent>/<elem>-attributes.tab}
 * (tab-separated, consumed in order, cross-checked on columns 0 to 4) alongside the pipes file
 * and writes {@code <elem>-extents.tab2} and {@code <elem>-attributes.tab2} in the same
 * directory. An extent and all its attribute lines are left out when column 9 of the matching
 * token line matches "(?i)(ser|estar|haber)"; that token line is printed on stdout instead.
 * An extent that continues the id of the previous one raises "Multi-token event". After the
 * last token, leftover extents or attributes raise an error.
 *
 * NOTE(review): column 9 is hard-coded - presumably the lemma column; confirm against the
 * features format.
 *
 * @param pipesfile token file the extents refer to
 * @param elem      element name, used for the input and output file names
 * @return always {@code null}: {@code outputfile} is never assigned in this method, and the
 *         error path returns {@code null} as well (errors are reported on stderr together with
 *         the line number; with the system property DEBUG set to "true" the JVM exits with
 *         status 1)
 */
public static String deletespecial_extents_and_attribs(PipesFile pipesfile, String elem) {
String outputfile = null;
int linen = 0;
try {
String extentsfile = pipesfile.getFile().getParent() + "/" + elem + "-extents.tab";
String attrsfile = pipesfile.getFile().getParent() + "/" + elem + "-attributes.tab";
// NOTE(review): the streams are opened before the try/finally below, so a failure while
// opening a later one leaves the earlier ones unclosed.
BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
BufferedReader extentsreader = new BufferedReader(new FileReader(extentsfile));
BufferedReader attrsreader = new BufferedReader(new FileReader(attrsfile));
BufferedWriter outextents = new BufferedWriter(new FileWriter(pipesfile.getFile().getParent() + "/" + elem + "-extents.tab2"));
BufferedWriter outattribs = new BufferedWriter(new FileWriter(pipesfile.getFile().getParent() + "/" + elem + "-attributes.tab2"));
try {
String pipesline;
String[] pipesarr = null;
String extentline = "";
String[] extentarr = null;
String extentId = ""; // id of the previously matched extent
String attrline = "";
String[] attrarr = null;
String attrId = ""; // id of the pending attribute
String curr_fileid = "";
while ((pipesline = pipesreader.readLine()) != null) {
linen++;
//System.out.println(pipesline);
pipesarr = pipesline.split("\\|");
// fetch the next extent only when the previous one has been consumed
if (extentarr == null && (extentline = extentsreader.readLine()) != null) {
extentarr = extentline.split("\t");
// to avoid joining t1 at the end of a file and t1 at the beginning of the next
if (!curr_fileid.equals(extentarr[0])) {
extentId = ""; // new id
curr_fileid = extentarr[0];
}
// only in B-elements
if (!extentId.equals(extentarr[4])) { // new id
if (attrarr != null) {
// an attribute line is already pending (read ahead by the loop further down)
if (!extentarr[0].equals(attrarr[0]) || !extentarr[1].equals(attrarr[1]) || !extentarr[2].equals(attrarr[2]) || !extentarr[3].equals(attrarr[3]) || !extentarr[4].equals(attrarr[4])) {
throw new Exception("Extents-Attributes incongruence:\n\t" + extentline + "\n\t" + attrline);
}
} else {
if ((attrline = attrsreader.readLine()) == null) {
throw new Exception("Attributes for extents (" + extentline + ") missing");
} else {
attrarr = attrline.split("\t");
attrId = attrarr[4];
// cross-check
if (!extentarr[0].equals(attrarr[0]) || !extentarr[1].equals(attrarr[1]) || !extentarr[2].equals(attrarr[2]) || !extentarr[3].equals(attrarr[3]) || !extentarr[4].equals(attrarr[4])) {
throw new Exception("Extents-Attributes incongruence:\n\t" + extentline + "\n\t" + attrline);
}
}
}
}
}
if (extentarr != null) {
// the token belongs to the extent when file, sentence and token columns agree
if (pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) {
if (!extentId.equals(extentarr[4])) {
//if(pipesarr[4].matches("(?i)(fue|soy|es|eres|somos|sois|son|era|eras|éramos|erais|eran|SEGUIR POR AQUIIIIIIIIIIIIIes)"))
// filtered tokens are only reported on stdout; everything else is copied
if (pipesarr[9].matches("(?i)(ser|estar|haber)")) {
System.out.println(pipesline);
} else {
outextents.write(extentline + "\n");
outattribs.write(attrline + "\n");
}
attrarr = null; // nullify scheme
// handle the remaining attributes of this element; the first line of another
// element stays pending in attrarr for the cross-check above
while ((attrline = attrsreader.readLine()) != null) {
attrarr = attrline.split("\t");
if (attrarr[7].trim().equals("")) {
throw new Exception("Empty attribute: " + attrline);
}
//System.out.println(attrId+" "+attrarr[4]+" "+curr_fileid+" "+attrarr[0]);
if (attrId.equals(attrarr[4]) && curr_fileid.equals(attrarr[0])) {
if (!pipesarr[9].matches("(?i)(ser|estar|haber)")) {
outattribs.write(attrline + "\n");
}
attrarr = null;
} else {
attrId = attrarr[4];
break;
}
}
//outfile.write("\n");
} else {
// a second extent line with the same id is not supported here
throw new Exception("Multi-token event");
}
extentId = extentarr[4];
extentarr = null;
} else {
extentId = "";
}
} else {
// no extents left
extentId = "";
}
}
// everything must have been consumed by now
if (extentarr != null) {
throw new Exception("Extents found without tokens correspondence: " + extentline + " " + elem);
}
// NOTE(review): this message prints extentline, not attrline - confirm that is intended
if (attrarr != null) {
throw new Exception("Attributes found without tokens correspondence: " + extentline + " " + elem);
}
if ((extentline = extentsreader.readLine()) != null) {
throw new Exception("Some extents not assigned (" + extentline + ") " + elem);
}
if ((attrline = attrsreader.readLine()) != null) {
throw new Exception("Some attributes not assigned (" + attrline + ") " + elem);
}
} finally {
if (pipesreader != null) {
pipesreader.close();
}
if (extentsreader != null) {
extentsreader.close();
}
if (attrsreader != null) {
attrsreader.close();
}
if (outextents != null) {
outextents.close();
}
if (outattribs != null) {
outattribs.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TempEval):\n\t" + e.toString() + " - line:" + linen + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
/**
 * Convenience entry point for
 * {@link #merge_attribs_specific(PipesFile, String, String, String)} that starts from paths.
 *
 * The attributes tab file is converted through {@code TabFile.getPipesFile()} and renamed with
 * {@code FileUtils.renameTo} (pattern "-attributes\.tab\.pipes" to "\.TempEval-attributes");
 * the features-and-extents file is wrapped in a {@code PipesFile}; after merging, the renamed
 * temporary file is deleted.
 *
 * @param features_and_extents path of the pipes file that already has the extents column
 * @param attribstab           path of the attributes tab file
 * @param elem                 element name forwarded to the merge
 * @param attrib_re            regular expression selecting the attribute names to keep
 * @param newattrname          name under which the selected attributes are written
 * @return whatever the {@code PipesFile} overload returns
 */
public static String merge_attribs_specific(String features_and_extents, String attribstab, String elem, String attrib_re, String newattrname) {
    TabFile attribsTab = new TabFile(attribstab);
    attribsTab.isWellFormatted();
    String pipesVersion = attribsTab.getPipesFile();
    String temporaryAttribs = FileUtils.renameTo(pipesVersion, "-attributes\\.tab\\.pipes", "\\.TempEval-attributes");

    PipesFile annotatedPipes = new PipesFile(features_and_extents);
    annotatedPipes.isWellFormedOptimist();

    String merged = merge_attribs_specific(annotatedPipes, elem, attrib_re, newattrname);
    // the renamed attributes file was only needed during the merge
    (new File(temporaryAttribs)).delete();
    return merged;
}
/**
 * Appends a column with selected attributes to a pipes file that has an element(IOB2) column.
 *
 * Attributes are read, in order, from {@code <parent>/<elemattr>.TempEval-attributes}
 * (pipe-separated). A token line matches the pending attribute when their first three columns
 * are equal; its element column must then be "B-" + attribute column 3. The line is written
 * followed by "|" and by {@code ;<attr_newname>=<col7>} for each attribute line of the same id
 * and file id whose column 6 matches {@code attr_re}. Every other line gets "|-".
 * The result is written to the canonical path of the pipes file plus "-attribs".
 *
 * @param pipesfile    pipes file with the element(IOB2) column
 * @param elemattr     element name, used for the attributes file name
 * @param attr_re      regular expression that the attribute name (column 6) must match
 * @param attr_newname name written in place of the attribute name
 * @return the path of the written file, or {@code null} on error (reported on stderr together
 *         with the line number; with the system property DEBUG set to "true" the JVM exits
 *         with status 1)
 */
public static String merge_attribs_specific(PipesFile pipesfile, String elemattr, String attr_re, String attr_newname) {
    String outputfile = null;
    int linen = 0;
    try {
        outputfile = pipesfile.getFile().getCanonicalPath() + "-attribs";
        String attrfile = pipesfile.getFile().getParent() + "/" + elemattr + ".TempEval-attributes";
        PipesFile attrpipes = new PipesFile(attrfile);
        attrpipes.isWellFormedOptimist();
        // opened inside the try so that a failure while opening a later stream
        // (or in getColumn) still closes the ones that were already opened
        BufferedWriter outfile = null;
        BufferedReader extentsreader = null;
        BufferedReader pipesreader = null;
        try {
            outfile = new BufferedWriter(new FileWriter(outputfile));
            extentsreader = new BufferedReader(new FileReader(attrfile));
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            int elemcol = pipesfile.getColumn("element\\(IOB2\\)");
            String attrline = "";
            String[] attrarr = null;
            String pipesline;
            String[] pipesarr = null;
            String attrId = ""; // id of the pending attribute
            String curr_fileid = "";
            while ((pipesline = pipesreader.readLine()) != null) {
                linen++;
                pipesarr = pipesline.split("\\|");
                // NOTE(review): an elemcol equal to pipesarr.length is not clamped by this
                // guard and would index out of range below - confirm whether ">=" is meant
                if (elemcol > pipesarr.length) {
                    elemcol = pipesarr.length - 1;
                }
                // fetch the next attribute only when the previous one has been consumed
                if (attrarr == null && (attrline = extentsreader.readLine()) != null) {
                    attrarr = attrline.split("\\|");
                    attrId = attrarr[4];
                    curr_fileid = attrarr[0];
                }
                if (attrarr != null) {
                    if (pipesarr[0].equals(attrarr[0]) && pipesarr[1].equals(attrarr[1]) && pipesarr[2].equals(attrarr[2])) {
                        if (!pipesarr[elemcol].equals("B-" + attrarr[3])) {
                            throw new Exception("Malformed TempEval attribs file (attribs not in B- element)\n" + pipesline + "\n" + attrline);
                        }
                        outfile.write(pipesline + "|");
                        // walk over the attribute lines of this element, starting with the
                        // pending one; the first line of another element stays pending
                        do {
                            attrarr = attrline.split("\\|");
                            if (attrId.equals(attrarr[4]) && curr_fileid.equals(attrarr[0])) {
                                if (attrarr[6].matches(attr_re)) {
                                    outfile.write(";" + attr_newname + "=" + attrarr[7]);
                                    attrarr = null;
                                }
                            } else {
                                attrId = attrarr[4];
                                curr_fileid = attrarr[0];
                                break;
                            }
                        } while ((attrline = extentsreader.readLine()) != null);
                        outfile.write("\n");
                    } else {
                        outfile.write(pipesline + "|-\n");
                    }
                } else {
                    outfile.write(pipesline + "|-\n");
                }
            }
        } finally {
            // close everything that was opened, even if one close() fails
            IOException closeError = null;
            for (Closeable c : new Closeable[]{pipesreader, extentsreader, outfile}) {
                if (c != null) {
                    try {
                        c.close();
                    } catch (IOException ex) {
                        if (closeError == null) {
                            closeError = ex;
                        }
                    }
                }
            }
            if (closeError != null) {
                throw closeError;
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + " - line:" + linen + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Annotates every line of a pipes-separated key file with a value taken from a second
 * pipes-separated file.
 *
 * Both files are read sequentially. A row of {@code attribsfile} is kept pending until a line of
 * {@code extentsfile} with the same first three columns is found; that line is written followed by
 * {@code |attrib="value"}, where value is the last column of the pending row. All other lines are
 * written followed by {@code |-}. Because rows are consumed strictly in order, a row that never
 * matches prevents all later rows from being used.
 *
 * The readers are opened before the writer and all three are opened inside the guarded region, so
 * a missing input neither leaks an open stream nor creates/truncates the output file.
 *
 * @param extentsfile path of the key file; the output is written next to it
 * @param attribsfile path of the file providing the values (last column)
 * @param attrib attribute name written in front of each value
 * @return {@code extentsfile + ".TempEval2-features-annotatedWith-attribs"}, or null if any
 *         exception occurred (when the system property DEBUG is "true" the stack trace is printed
 *         and the JVM exits with status 1)
 */
public static String merge_classik(String extentsfile, String attribsfile, String attrib) {
    String outputfile = null;
    try {
        outputfile = extentsfile + ".TempEval2-features-annotatedWith-attribs";
        PipesFile keypipes = new PipesFile(extentsfile);
        keypipes.isWellFormedOptimist();
        BufferedReader extentsreader = null;
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            extentsreader = new BufferedReader(new FileReader(extentsfile));
            pipesreader = new BufferedReader(new FileReader(new File(attribsfile)));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            // Pending attribute row; null means the next one has to be read.
            String[] pipesarr = null;
            String extentline;
            while ((extentline = extentsreader.readLine()) != null) {
                String[] extentarr = extentline.split("\\|");
                if (pipesarr == null) {
                    String pipesline = pipesreader.readLine();
                    if (pipesline != null) {
                        pipesarr = pipesline.split("\\|");
                    }
                }
                if (pipesarr != null && pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) {
                    outfile.write(extentline + "|" + attrib + "=\"" + pipesarr[pipesarr.length - 1] + "\"\n");
                    pipesarr = null;
                } else {
                    outfile.write(extentline + "|-\n");
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (extentsreader != null) {
                extentsreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Annotates the key file that {@code pipesfile} was derived from with the last column of
 * {@code pipesfile}.
 *
 * The key file path is the canonical path of {@code pipesfile} with everything from the last
 * {@code '.'} removed. Both files are read sequentially. A row of {@code pipesfile} is kept pending
 * until a key line with the same first three columns is found; that line is written followed by
 * {@code |attrib=value} (value is not quoted). All other key lines are written followed by
 * {@code |-}.
 *
 * The readers are opened before the writer and all three are opened inside the guarded region, so
 * a missing input neither leaks an open stream nor creates/truncates the output file.
 *
 * @param pipesfile file providing the values (last column)
 * @param attrib attribute name written in front of each value
 * @return canonical path of {@code pipesfile} + ".TempEval-features-annotatedWith-attribs", or null
 *         if any exception occurred (when the system property DEBUG is "true" the stack trace is
 *         printed and the JVM exits with status 1)
 */
public static String merge_classik(PipesFile pipesfile, String attrib) {
    String outputfile = null;
    try {
        String canonical = pipesfile.getFile().getCanonicalPath();
        outputfile = canonical + ".TempEval-features-annotatedWith-attribs";
        // Strip the last extension to obtain the key file the pipes file belongs to.
        String extentsfile = canonical.substring(0, canonical.lastIndexOf('.'));
        PipesFile keypipes = new PipesFile(extentsfile);
        keypipes.isWellFormedOptimist();
        BufferedReader extentsreader = null;
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            extentsreader = new BufferedReader(new FileReader(extentsfile));
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            // Pending attribute row; null means the next one has to be read.
            String[] pipesarr = null;
            String extentline;
            while ((extentline = extentsreader.readLine()) != null) {
                String[] extentarr = extentline.split("\\|");
                if (pipesarr == null) {
                    String pipesline = pipesreader.readLine();
                    if (pipesline != null) {
                        pipesarr = pipesline.split("\\|");
                    }
                }
                if (pipesarr != null && pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) {
                    outfile.write(extentline + "|" + attrib + "=" + pipesarr[pipesarr.length - 1] + "\n");
                    pipesarr = null;
                } else {
                    outfile.write(extentline + "|-\n");
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (extentsreader != null) {
                extentsreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Appends an attribute to the lines of {@code appendfile}, taking the value from the last column
 * of {@code pipesfile}.
 *
 * Both files are read sequentially. A row of {@code pipesfile} is kept pending until a line of
 * {@code appendfile} with the same first three columns is found; that line is written followed by
 * {@code ;attrib=value}. All other lines are copied unchanged.
 *
 * The readers are opened before the writer and all three are opened inside the guarded region, so
 * a missing input neither leaks an open stream nor creates/truncates the output file.
 *
 * @param appendfile path of the pipes-separated file whose lines are extended
 * @param pipesfile file providing the values (last column)
 * @param attrib attribute name written in front of each value
 * @return canonical path of {@code pipesfile} + ".TempEval-features-annotatedWith-attribs-append",
 *         or null if any exception occurred (when the system property DEBUG is "true" the stack
 *         trace is printed and the JVM exits with status 1)
 */
public static String merge_classik_append(String appendfile, PipesFile pipesfile, String attrib) {
    String outputfile = null;
    try {
        outputfile = pipesfile.getFile().getCanonicalPath() + ".TempEval-features-annotatedWith-attribs-append";
        String extentsfile = appendfile;
        PipesFile keypipes = new PipesFile(extentsfile);
        keypipes.isWellFormedOptimist();
        BufferedReader extentsreader = null;
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            extentsreader = new BufferedReader(new FileReader(extentsfile));
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            // Pending attribute row; null means the next one has to be read.
            String[] pipesarr = null;
            String extentline;
            while ((extentline = extentsreader.readLine()) != null) {
                String[] extentarr = extentline.split("\\|");
                if (pipesarr == null) {
                    String pipesline = pipesreader.readLine();
                    if (pipesline != null) {
                        pipesarr = pipesline.split("\\|");
                    }
                }
                if (pipesarr != null && pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) {
                    outfile.write(extentline + ";" + attrib + "=" + pipesarr[pipesarr.length - 1] + "\n");
                    pipesarr = null;
                } else {
                    outfile.write(extentline + "\n");
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (extentsreader != null) {
                extentsreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Writes a tab-separated extents file from the IOB2 column of a pipes file.
 *
 * The IOB2 column is the one named "element(IOB2)"; if no such column exists,
 * {@code getLastDescColumn()} is used instead. For every line whose IOB2 value is not "O", one row
 * is written with: columns 0, 1 and 2 of the line, the element name (IOB2 value without its first
 * two characters) in lower case, an id made of the first letter of the element name in lower case
 * plus a counter, and the constant "1". The counter is incremented on each "B-" tag and reset to 0
 * whenever column 0 (the file id) changes.
 *
 * The reader is opened before the writer and both are opened inside the guarded region, so an
 * unreadable input neither leaks the writer nor leaves an empty output file behind.
 *
 * @param pipesfile pipes file to read
 * @return canonical path of {@code pipesfile} + "-extents.tab", or null if any exception occurred
 *         (when the system property DEBUG is "true" the stack trace is printed and the JVM exits
 *         with status 1)
 */
public static String generate_tab_extents(PipesFile pipesfile) {
    String outputfile = null;
    int linen = 0;
    try {
        int iob2col = pipesfile.getColumn("element\\(IOB2\\)");
        if (iob2col == -1) {
            iob2col = pipesfile.getLastDescColumn();
        }
        outputfile = pipesfile.getFile().getCanonicalPath() + "-extents.tab";
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            int elemid = 0; // element counter, restarted for each file id
            String curr_fileid = "";
            String pipesline;
            while ((pipesline = pipesreader.readLine()) != null) {
                linen++;
                String[] pipesarr = pipesline.split("\\|");
                if (!curr_fileid.equals(pipesarr[0])) {
                    elemid = 0;
                    curr_fileid = pipesarr[0];
                }
                String tag = pipesarr[iob2col];
                if (!tag.equals("O")) {
                    String iob2 = tag.substring(0, 2);
                    String element = tag.substring(2);
                    if (iob2.equals("B-")) {
                        elemid++;
                    }
                    outfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2] + "\t" + element.toLowerCase() + "\t" + element.substring(0, 1).toLowerCase() + elemid + "\t1\n");
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + " (" + linen + ")\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
* Writes a tab-separated attributes file from the attributes column of a pipes file.
*
* The IOB2 column is the one named "element(IOB2)", or getLastDescColumn() - 1 when no such column
* exists; the attributes column is the one right after it. Lines whose attributes column is "-" or
* "*" are skipped. For every other line, one row per attribute is written with: columns 0, 1 and 2
* of the line, the element name in lower case, an id made of the first letter of the element name
* in lower case plus a counter, the constant "1", the attribute name and the attribute value.
* Attributes named "tid" or "eid" are not written. The counter is incremented once per line that
* carries attributes and reset to 0 whenever column 0 (the file id) changes.
*
* Two attribute layouts are accepted: name="value" pairs separated by spaces, and name=value pairs
* separated by semicolons.
*
* @param pipesfile pipes file to read
* @return canonical path of pipesfile + "-attributes.tab", or null if any exception occurred (when
* the system property DEBUG is "true" the stack trace is printed and the JVM exits with status 1)
*/
public static String generate_tab_attribs(PipesFile pipesfile) {
String outputfile = null;
// Number of lines read so far; only used in the error message.
int linen = 0;
try {
int iob2col = pipesfile.getColumn("element\\(IOB2\\)");
if (iob2col == -1) {
// Fallback: the attributes column is taken to be the last one and IOB2 the one before it.
iob2col = pipesfile.getLastDescColumn() - 1;
}
int attrscol = iob2col + 1;
outputfile = pipesfile.getFile().getCanonicalPath() + "-attributes.tab";
// NOTE(review): both streams are opened before the try/finally below, so if the reader cannot be
// opened the writer is not closed and an empty output file is left behind.
BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
try {
String pipesline;
String[] pipesarr = null;
int elemid = 0; // save id per file
String curr_fileid = "";
while ((pipesline = pipesreader.readLine()) != null) {
linen++;
pipesarr = pipesline.split("\\|");
if (!curr_fileid.equals(pipesarr[0])) {
elemid = 0;
curr_fileid = pipesarr[0];
}
//System.err.println(pipesline + " ----> "+pipesarr[iob2col]+" attrcol ("+attrscol+") l="+pipesarr.length);
if (!pipesarr[attrscol].equals("-") && !pipesarr[attrscol].equals("*")) {
//System.err.println(pipesline + " ----> "+pipesarr[iob2col]);
// Space-separated layout: detected by a closing quote followed by a space.
if (pipesarr[attrscol].matches(".*[^\"=]\" .*")) { // spaces
String attrs = pipesarr[attrscol].replaceAll("\\s+", " ").trim();
// Drop the final quote so that splitting on quote+space leaves every piece as name="value
if (attrs.endsWith("\"")) {
attrs = attrs.substring(0, attrs.length() - 1);
}
String[] attrsarr = attrs.split("\" ");
String element = pipesarr[iob2col].substring(2);
elemid++;
for (int i = 0; i < attrsarr.length; i++) {
// Pieces that do not look like name="value (or whose value contains '=') are silently ignored.
if (attrsarr[i].matches("[^=]+=\"[^=]+")) {
String attrname = attrsarr[i].substring(0, attrsarr[i].indexOf("=\""));
String attrvalue = attrsarr[i].substring(attrsarr[i].indexOf("=\"") + 2);
if (attrvalue.matches("\".*\"")) {
attrvalue = attrvalue.substring(1, attrvalue.length() - 1);
}
if (!attrname.matches("(t|e)id")) {
//outfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2].substring(0, pipesarr[2].indexOf('-')) + "\t" + element.toLowerCase() + "\t" + element.substring(0, 1).toLowerCase() + elemid + "\t1\t" + attrname + "\t" + attrvalue + "\n");
outfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2] + "\t" + element.toLowerCase() + "\t" + element.substring(0, 1).toLowerCase() + elemid + "\t1\t" + attrname + "\t" + attrvalue + "\n");
}
}
}
} else { // semicolon
String[] attrsarr = pipesarr[attrscol].trim().split(";");
String element = pipesarr[iob2col].substring(2);
elemid++;
for (int i = 0; i < attrsarr.length; i++) {
// NOTE(review): a piece without '=' makes indexOf return -1 and substring throw, which aborts
// the whole file through the outer catch - confirm this is the intended validation.
String attrname = attrsarr[i].substring(0, attrsarr[i].indexOf('='));
String attrvalue = attrsarr[i].substring(attrsarr[i].indexOf("=") + 1);
// Surrounding quotes, if any, are removed from the value.
if (attrvalue.matches("\".*\"")) {
attrvalue = attrvalue.substring(1, attrvalue.length() - 1);
}
if (!attrname.matches("(t|e)id")) {
//outfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2].substring(0, pipesarr[2].indexOf('-')) + "\t" + element.toLowerCase() + "\t" + element.substring(0, 1).toLowerCase() + elemid + "\t1\t" + attrname + "\t" + attrvalue + "\n");
outfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2] + "\t" + element.toLowerCase() + "\t" + element.substring(0, 1).toLowerCase() + elemid + "\t1\t" + attrname + "\t" + attrvalue + "\n");
}
}
}
}
}
} finally {
if (pipesreader != null) {
pipesreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (TempEval):\n\t" + e.toString() + " (Reading line " + linen + ")\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
/**
 * Writes both the extents and the attributes tab files of a pipes file, using the ids found in the
 * annotation itself instead of generated counters.
 *
 * The IOB2 column is the one named "element(IOB2)", or {@code getLastDescColumn()} when no such
 * column exists; the attributes column is the one right after it.
 * NOTE(review): with that fallback the attributes column lies one past the last described column -
 * confirm against {@code PipesFile.getLastDescColumn()}.
 *
 * For every line whose IOB2 value is not "O" an extents row is written (columns 0, 1, 2, element
 * name in lower case, element id, "1"). On "B-" lines the attributes are parsed with
 * {@code XmlAttribs.parseAttrs}; the value of the first attribute whose name matches
 * {@code id_re} becomes the element id, and every other attribute produces one attributes row. The
 * id is reset to the placeholder "unknown" whenever column 0 (the file id) changes.
 *
 * Output paths are built as parent directory of the pipes file + {@code filenamebase} +
 * "-extents.tab" / "-attributes.tab"; no separator is inserted between the directory and
 * {@code filenamebase}.
 *
 * Fixes over the previous version: the initial placeholder id was misspelled ("unkonwn") and so
 * differed from the one used after a file change, and the three streams were opened outside the
 * guarded region, leaking the earlier ones when a later open failed.
 *
 * @param pipesfile pipes file to read
 * @param filenamebase text appended to the parent directory to build both output paths
 * @param id_re regular expression identifying the id attribute name
 * @return true on success, false if any exception occurred (when the system property DEBUG is
 *         "true" the stack trace is printed and the JVM exits with status 1)
 */
public static Boolean generate_tab_extents_and_attribs_with_real_id(PipesFile pipesfile, String filenamebase, String id_re) {
    int linen = 0;
    try {
        int iob2col = pipesfile.getColumn("element\\(IOB2\\)");
        if (iob2col == -1) {
            iob2col = pipesfile.getLastDescColumn();
        }
        int attrscol = iob2col + 1;
        BufferedReader pipesreader = null;
        BufferedWriter extentsfile = null;
        BufferedWriter attribsfile = null;
        try {
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            extentsfile = new BufferedWriter(new FileWriter(pipesfile.getFile().getParent() + filenamebase + "-extents.tab"));
            attribsfile = new BufferedWriter(new FileWriter(pipesfile.getFile().getParent() + filenamebase + "-attributes.tab"));
            String elemid = "unknown"; // id of the element currently open, per file
            String curr_fileid = "";
            String pipesline;
            while ((pipesline = pipesreader.readLine()) != null) {
                linen++;
                String[] pipesarr = pipesline.split("\\|");
                if (!curr_fileid.equals(pipesarr[0])) {
                    elemid = "unknown";
                    curr_fileid = pipesarr[0];
                }
                // Extents are written for B- and I- lines alike.
                if (!pipesarr[iob2col].equals("O")) {
                    String element = pipesarr[iob2col].substring(2);
                    // Attributes are only read on the B- line of an element.
                    if (pipesarr[iob2col].substring(0, 2).equalsIgnoreCase("B-")) {
                        // There must be at least the id attribute.
                        if (pipesarr[attrscol].equals("-") || pipesarr[attrscol].equals("*")) {
                            throw new Exception("Found B-element without attribs");
                        }
                        HashMap<String, String> attribs = XmlAttribs.parseAttrs(pipesarr[attrscol]);
                        boolean id_found = false;
                        for (String current_attrib : attribs.keySet()) {
                            if (current_attrib.matches(id_re)) {
                                elemid = attribs.get(current_attrib);
                                id_found = true;
                                break;
                            }
                        }
                        if (!id_found) {
                            throw new Exception("All the elements must have an ID (" + id_re + "). Line: " + pipesline);
                        }
                        // The id has to be known before any attribute row can be written.
                        for (String current_attrib : attribs.keySet()) {
                            if (!current_attrib.matches(id_re)) {
                                attribsfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2] + "\t" + element.toLowerCase() + "\t" + elemid + "\t1\t" + current_attrib + "\t" + attribs.get(current_attrib) + "\n");
                            }
                        }
                    }
                    extentsfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2] + "\t" + element.toLowerCase() + "\t" + elemid + "\t1\n");
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (extentsfile != null) {
                extentsfile.close();
            }
            if (attribsfile != null) {
                attribsfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + " (Reading line " + linen + ")\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return false;
    }
    return true;
}
/**
 * Writes a tab-separated links file containing, for every line of the pipes file, its first three
 * columns followed by its last column (the equivalent of {@code cut -f 1-3,last -d "|"} saved in
 * tab format).
 *
 * The reader is opened before the writer and both are opened inside the guarded region, so an
 * unreadable input neither leaks the writer nor leaves an empty output file behind.
 *
 * @param pipesfile pipes file to read
 * @return canonical path of {@code pipesfile} + "-links.tab", or null if any exception occurred
 *         (when the system property DEBUG is "true" the stack trace is printed and the JVM exits
 *         with status 1)
 */
public static String generate_tab_links(PipesFile pipesfile) {
    String outputfile = null;
    try {
        outputfile = pipesfile.getFile().getCanonicalPath() + "-links.tab";
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            String pipesline;
            while ((pipesline = pipesreader.readLine()) != null) {
                String[] pipesarr = pipesline.split("\\|");
                outfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2] + "\t" + pipesarr[pipesarr.length - 1] + "\n");
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Converts the first four columns of a pipes-separated ".plain" file into a tab-separated file.
 *
 * The output path is the input path up to its first ".plain" followed by ".plain.tab". Every input
 * line must have at least four pipe-separated columns; otherwise processing stops and null is
 * returned.
 *
 * The input is opened before the output and both are opened inside the guarded region, so a
 * missing or unreadable input no longer leaks the writer nor leaves an empty ".plain.tab" file
 * behind.
 *
 * @param file path of the pipes file; must contain ".plain"
 * @return the path of the generated file, or null if any exception occurred (when the system
 *         property DEBUG is "true" the stack trace is printed and the JVM exits with status 1)
 */
public static String generate_base_segmentation(String file) {
    String outputfile = null;
    int linen = 0;
    try {
        outputfile = file.substring(0, file.indexOf(".plain")) + ".plain.tab";
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            // Reader first: FileWriter creates/truncates its target as soon as it is constructed.
            pipesreader = new BufferedReader(new FileReader(file));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            String pipesline;
            while ((pipesline = pipesreader.readLine()) != null) {
                linen++;
                String[] pipesarr = pipesline.split("\\|");
                outfile.write(pipesarr[0] + "\t" + pipesarr[1] + "\t" + pipesarr[2] + "\t" + pipesarr[3] + "\n");
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + " (" + linen + ")\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Appends the document creation time (DCT) of a TimeML file to the "dct.tab" file located in the
 * same directory.
 *
 * The TML file read is {@code tmlfile} cut after the first ".tml" found from its last '/' onwards.
 * The first line containing {@code tid="}, {@code value="} and
 * {@code functionInDocument="CREATION_TIME"} or {@code "PUBLICATION_TIME"} provides the DCT, and
 * one row is appended: file name (without directory and ".tml") + ".tml.plain", the value and the
 * tid, separated by tabs.
 *
 * The TML file is scanned completely before "dct.tab" is opened, so a missing TML file or a
 * document without DCT no longer leaks the writer nor creates an empty "dct.tab". The line filter
 * now requires the same quoted {@code tid="} / {@code value="} forms that the extraction relies
 * on, instead of extracting garbage from lines that merely contain "tid=" and "value".
 *
 * @param tmlfile path of the TimeML file, using '/' as separator
 * @return the path of "dct.tab"; it is returned even when an error occurred and nothing was
 *         written (null only if the path itself could not be computed)
 */
public static String tml2dct_tab(String tmlfile) {
    String outputfile = null;
    try {
        int lastSlash = tmlfile.lastIndexOf('/');
        outputfile = tmlfile.substring(0, lastSlash + 1) + "dct.tab";
        int extPos = tmlfile.indexOf(".tml", lastSlash);
        String fileid = null;
        String value = null;
        String tid = null;
        BufferedReader tmlreader = new BufferedReader(new FileReader(tmlfile.substring(0, extPos) + ".tml"));
        try {
            String line;
            while ((line = tmlreader.readLine()) != null) {
                int valuePos = line.indexOf("value=\"");
                int tidPos = line.indexOf("tid=\"");
                if (valuePos != -1 && tidPos != -1
                        && line.matches(".*functionInDocument=\"(CREATION|PUBLICATION)_TIME\".*")) {
                    fileid = tmlfile.substring(lastSlash + 1, extPos);
                    // 7 and 5 are the lengths of value=" and tid=" respectively.
                    value = line.substring(valuePos + 7, line.indexOf("\"", valuePos + 7));
                    tid = line.substring(tidPos + 5, line.indexOf("\"", tidPos + 5));
                    break;
                }
            }
        } finally {
            tmlreader.close();
        }
        if (fileid == null) {
            throw new Exception("Reference date (dct) not found as CREATION_TIME/PUBLICATION_TIME: "+tmlfile);
        }
        // Append mode: dct.tab accumulates one row per processed document.
        BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile, true));
        try {
            outfile.write(fileid + ".tml.plain\t" + value + "\t" + tid + "\n");
        } finally {
            outfile.close();
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEvalFiles):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            //System.exit(1);
        }
        // Deliberately not fatal: the caller still receives the dct.tab path.
        return outputfile;
    }
    return outputfile;
}
/**
 * Loads a dct.tab file into a map from the first column (document id) to a {@code [value, tid]}
 * pair.
 *
 * Each row has two or three tab-separated columns: id, value and optionally tid. When the tid
 * column is absent, "t0" is used. A value made of exactly eight digits (yyyymmdd) is rewritten as
 * yyyy-mm-dd. Rows with fewer than two columns (for instance blank lines) are now skipped instead
 * of aborting the whole load; rows with more than three columns are ignored as before.
 *
 * @param dctsTabFile path of the dct.tab file
 * @return the map of DCTs, or null if the file does not exist or an exception occurred (when the
 *         system property DEBUG is "true" the stack trace is printed and the JVM exits with
 *         status 1)
 */
public static HashMap<String, String[]> getDCTsFromTab(String dctsTabFile) {
    HashMap<String, String[]> DCTs = null;
    try {
        if (!(new File(dctsTabFile)).exists()) {
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                System.err.println(dctsTabFile + " does not exist.");
            }
            return null;
        }
        BufferedReader dctreader = new BufferedReader(new FileReader(dctsTabFile));
        try {
            DCTs = new HashMap<String, String[]>();
            String line;
            while ((line = dctreader.readLine()) != null) {
                String[] linearr = line.split("\t");
                // A blank line splits into a single empty column: nothing to read from it.
                if (linearr.length < 2) {
                    continue;
                }
                if (linearr[1].matches("[0-9]{8}")) {
                    linearr[1] = linearr[1].substring(0, 4) + "-" + linearr[1].substring(4, 6) + "-" + linearr[1].substring(6, 8);
                }
                if (linearr.length == 2) {
                    DCTs.put(linearr[0], new String[]{linearr[1], "t0"});
                } else if (linearr.length == 3) {
                    DCTs.put(linearr[0], new String[]{linearr[1], linearr[2]});
                }
            }
        } finally {
            dctreader.close();
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEvalFiles):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return DCTs;
}
}