package com.cognitionis.nlp_files;
import com.cognitionis.utils_basickit.StringUtils;
import com.cognitionis.utils_basickit.FileUtils;
import java.io.*;
/**
*
* @author Héctor Llorens
* @since 2011
*/
public class PipesFile extends NLPFile {
/*
* Pipes description array: the column names obtained by splitting the first
* usable line of the ".pipes-desc" description file on '|'. It stays null
* until isWellFormedOptimist() or isWellFormatted() has loaded a description.
*/
private String[] pipes_desc_arr;
// Number of entries in pipes_desc_arr (0 until a description has been loaded).
private Integer pipes_desc_arr_count;
// Set to true by isWellFormedOptimist()/isWellFormatted() when they succeed;
// the conversion and pairing methods refuse to run while it is false.
public Boolean isWellFormed;
/**
 * Creates a pipes file wrapper for the given file name.
 * No column description is loaded yet and the file is flagged as not
 * well formed until one of the checking methods succeeds.
 *
 * @param filename name of the pipes file, handed to the superclass constructor
 */
public PipesFile(String filename) {
    super(filename);
    this.isWellFormed = false;
    this.pipes_desc_arr_count = 0;
    this.pipes_desc_arr = null;
}
/**
 * Loads the column description for this file without validating the
 * contents of the pipes file itself.
 *
 * The description is looked up as "&lt;extension&gt;.pipes-desc", first in the
 * directory of this file and then in the application description directory
 * (FileUtils.getApplicationPath() + FileUtils.NLPFiles_descr_path). The
 * first line that is exactly "word" or that contains a '|' defines the
 * columns.
 *
 * Side effects: the extension field is truncated right after
 * "-annotatedWith" / "-annotationKey" when it contains those markers, and
 * the description array, its count and isWellFormed are updated on success.
 * Errors are reported on System.err; when the system property DEBUG is
 * "true" the stack trace is printed and the JVM exits.
 *
 * @return true when a valid description was loaded, false otherwise
 */
public Boolean isWellFormedOptimist() {
    try {
        if (extension.matches("\\s*")) {
            throw new Exception("PipesFile must have an extension (i.e., .tok)");
        }
        if (extension.contains("annotatedWith")) {
            extension = extension.substring(0, extension.lastIndexOf("-annotatedWith") + 14);
        }
        if (extension.contains("annotationKey")) {
            extension = extension.substring(0, extension.lastIndexOf("-annotationKey") + 14);
        }
        // Resolve the sibling description through the File API instead of
        // searching the canonical path for '/', which does not work on
        // platforms that use a different separator character.
        File pipes_desc_file = new File(this.f.getCanonicalFile().getParentFile(), extension + ".pipes-desc");
        if (!pipes_desc_file.exists() || !pipes_desc_file.isFile()) {
            // fall back to the application-wide description directory
            pipes_desc_file = new File(FileUtils.getApplicationPath() + FileUtils.NLPFiles_descr_path + extension + ".pipes-desc");
            if (!pipes_desc_file.exists() || !pipes_desc_file.isFile()) {
                throw new Exception("PipesFile description file (" + extension + ".pipes-desc) not found in " + pipes_desc_file.getCanonicalPath());
            }
        }
        // read pipes desc, count fields (cols)
        BufferedReader reader = new BufferedReader(new FileReader(pipes_desc_file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.equals("word")) {
                    // single-column description
                    pipes_desc_arr = new String[1];
                    pipes_desc_arr[0] = "word";
                    pipes_desc_arr_count = 1;
                    break;
                }
                if (line.matches(".*\\|.*")) {
                    pipes_desc_arr = line.split("\\|");
                    pipes_desc_arr_count = pipes_desc_arr.length;
                    break;
                }
            }
        } finally {
            reader.close();
        }
        if (pipes_desc_arr == null) {
            throw new Exception(pipes_desc_file + " is not a valid PipesFile description file");
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return false;
    }
    this.isWellFormed = true;
    return true;
}
/**
 * Loads the column description for this file and validates every line of
 * the pipes file against it.
 *
 * The description is looked up as "&lt;extension&gt;.pipes-desc", first in the
 * directory of this file and then in the application description directory.
 * The first description line containing a '|' defines the columns. Each
 * non-empty data line must contain a '|', and every line longer than one
 * character must have at least as many columns as the description. Unless
 * the extension matches one of the exempt patterns, the last line must be
 * the sentence marker "|".
 *
 * Side effects: the extension field is truncated right after
 * "-annotatedWith" / "-annotationKey" when it contains those markers, and
 * the description array, its count and isWellFormed are updated on success.
 * Errors are reported on System.err; when the system property DEBUG is
 * "true" the stack trace is printed and the JVM exits.
 *
 * @return true when the description and the file contents are valid,
 *         false otherwise
 */
public Boolean isWellFormatted() {
    try {
        if (extension.matches("\\s*")) {
            throw new Exception("PipesFile must have an extension (i.e., .tok)");
        }
        if (extension.contains("annotatedWith")) {
            extension = extension.substring(0, extension.lastIndexOf("-annotatedWith") + 14);
        }
        if (extension.contains("annotationKey")) {
            extension = extension.substring(0, extension.lastIndexOf("-annotationKey") + 14);
        }
        // Resolve the sibling description through the File API instead of
        // searching the canonical path for '/', which does not work on
        // platforms that use a different separator character.
        File pipes_desc_file = new File(this.f.getCanonicalFile().getParentFile(), extension + ".pipes-desc");
        if (!pipes_desc_file.exists() || !pipes_desc_file.isFile()) {
            // fall back to the application-wide description directory
            pipes_desc_file = new File(FileUtils.getApplicationPath() + FileUtils.NLPFiles_descr_path + extension + ".pipes-desc");
            if (!pipes_desc_file.exists() || !pipes_desc_file.isFile()) {
                throw new Exception("PipesFile description file (" + extension + ".pipes-desc) not found in " + pipes_desc_file.getCanonicalPath());
            }
        }
        // read pipes desc, count fields (cols)
        BufferedReader reader = new BufferedReader(new FileReader(pipes_desc_file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.matches(".*\\|.*")) {
                    pipes_desc_arr = line.split("\\|");
                    pipes_desc_arr_count = pipes_desc_arr.length;
                    break;
                }
            }
        } finally {
            reader.close();
        }
        if (pipes_desc_arr == null) {
            throw new Exception(pipes_desc_file + " is not a valid PipesFile description file");
        }
        // read pipesFile line by line and check that it has at least X piped cols
        reader = new BufferedReader(new FileReader(this.f));
        try {
            String line;
            String lastline = null;
            int linen = 0;
            while ((line = reader.readLine()) != null) {
                linen++;
                lastline = line;
                if (line.length() > 0 && line.matches("[^\\|]+")) {
                    throw new Exception("Line " + linen + " is not valid pipesFile line: Has contet without |");
                }
                if (line.length() > 1 && line.matches(".*\\|.*")) { // it permits a | as an empty line
                    int columns = line.split("\\|").length;
                    if (columns < this.pipes_desc_arr_count) {
                        throw new Exception("Line " + linen + " is not valid pipesFile line: Has less columns (" + columns + ") than description file(" + pipes_desc_arr_count + ")");
                    }
                }
            }
            // lastline is null for an empty file: report that as a missing
            // end marker instead of failing with a NullPointerException.
            if (!extension.matches("(tab|pipes.*|TempEval.*|roth-.*|srlpaired.*|poslemma.*)") && (lastline == null || !lastline.trim().equals("|"))) {
                throw new Exception("Last line does not end with an end sentence marker |. (DEPRECATED... MODIFY..)");
            }
        } finally {
            reader.close();
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return false;
    }
    this.isWellFormed = true;
    return true;
}
/**
 * Gives access to the column description array.
 *
 * @return the internal array itself (not a copy), or null when no
 *         description has been loaded yet
 */
public String[] getPipesDescArr() {
    return pipes_desc_arr;
}
/**
 * Gives the number of columns in the loaded description.
 *
 * @return the stored column count, unboxed to a primitive int
 */
public int getPipesDescArrCount() {
    return pipes_desc_arr_count.intValue();
}
/**
 * Writes a copy of this pipes file, split into sentences, to
 * "&lt;file&gt;.pipes".
 *
 * Column 0 of every input line is taken as the file name and column 1 as
 * the sentence number. Whenever either value differs from the previous
 * line, a line containing only "|" is written first. Each output line
 * contains the input columns from index 3 up to the last one (the last
 * column is always written), and a final "|" line closes the output.
 *
 * Errors are reported on System.err together with the number of the input
 * line being processed; when the system property DEBUG is "true" the stack
 * trace is printed and the JVM exits.
 *
 * @return the output file name, or null if an error occurred
 */
public String sentSplit() {
    String outputfile = this.getFile().toString() + ".pipes";
    int numline = 0;
    // Declared outside the try so that both streams are released even when
    // opening the second one fails.
    BufferedReader pipesreader = null;
    BufferedWriter outfile = null;
    try {
        try {
            pipesreader = new BufferedReader(new FileReader(f));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            int sentcolumn = 1;
            int filecolumn = 0;
            String line;
            String numsent = "-1"; // "-1" marks "no previous line yet"
            String filename = "-1";
            while ((line = pipesreader.readLine()) != null) {
                numline++;
                String[] linearr = line.split("\\|");
                if ((!filename.equals(linearr[filecolumn]) || !numsent.equals(linearr[sentcolumn])) && !numsent.equals("-1") && !filename.equals("-1")) {
                    outfile.write("|\n");
                }
                for (int i = 3; i < linearr.length - 1; i++) {
                    outfile.write(linearr[i] + "|");
                }
                outfile.write(linearr[linearr.length - 1] + "\n");
                numsent = linearr[sentcolumn];
                filename = linearr[filecolumn];
            }
            outfile.write("|\n");
        } finally {
            // chain the closes so a failure on the reader cannot skip the writer
            try {
                if (pipesreader != null) {
                    pipesreader.close();
                }
            } finally {
                if (outfile != null) {
                    outfile.close();
                }
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + " (line " + numline + ")\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Reports the language of this file. No detection is performed: the
 * result is always the fixed code "en".
 *
 * @return the string "en"
 */
public String detectLanguage() {
    final String language = "en";
    return language;
}
/**
 * Rebuilds plain text from the token column and writes it to
 * "&lt;file&gt;.plain", one sentence per output line.
 *
 * Lines with at least as many columns as the description are token lines;
 * any other line ends the current sentence. For the "TempEval-bs"
 * extension, tokens matching "(\*.*|0)" are skipped, the bracket
 * placeholders (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-) are replaced by
 * the real bracket, tokens are joined with one space and a change in the
 * "sent-num" column starts a new output line. For other extensions, tokens
 * are preceded by the number of spaces given in the leading-blanks column
 * when there is one, or by a single space otherwise.
 *
 * The file must have been checked first (isWellFormed must be true). That
 * check now happens before the output file is opened, so a malformed file
 * no longer leaves a truncated, unclosed output file behind.
 *
 * Errors are reported on System.err; when the system property DEBUG is
 * "true" the stack trace is printed and the JVM exits.
 *
 * @return the output file name, or null if an error occurred
 */
public String toPlain() {
    String outputfile = this.getFile().toString() + ".plain";
    BufferedWriter outfile = null;
    BufferedReader pipesreader = null;
    try {
        try {
            if (!this.isWellFormed) {
                throw new Exception("Malformed pipes file");
            }
            int tokcolumn = this.getColumn("(token|word).*");
            int blankscolumn = this.getColumn("(leading.?blanks?|blanks)");
            int sentnumcolumn = this.getColumn("sent-num");
            boolean tempEvalBs = this.extension.equalsIgnoreCase("TempEval-bs");
            outfile = new BufferedWriter(new FileWriter(outputfile));
            pipesreader = new BufferedReader(new FileReader(this.f));
            String line;
            String outputline = "";
            int current_sentence = 0; // (from 0 to n)
            while ((line = pipesreader.readLine()) != null) {
                String[] linearr = line.split("\\|");
                if (linearr.length < this.pipes_desc_arr_count) {
                    // not a token line: close the current sentence
                    current_sentence++;
                    outfile.write(outputline + "\n");
                    outputline = "";
                    continue;
                }
                String token = linearr[tokcolumn];
                if (tempEvalBs) {
                    if (token.matches("(\\*.*|0)")) {
                        continue;
                    }
                    if (token.equals("-LRB-")) {
                        token = "(";
                    } else if (token.equals("-RRB-")) {
                        token = ")";
                    } else if (token.equals("-LSB-")) {
                        token = "[";
                    } else if (token.equals("-RSB-")) {
                        token = "]";
                    } else if (token.equals("-LCB-")) {
                        token = "{";
                    } else if (token.equals("-RCB-")) {
                        token = "}";
                    }
                    boolean sentenceChanged = false;
                    if (sentnumcolumn != -1) {
                        int sentnum = Integer.parseInt(linearr[sentnumcolumn]);
                        if (sentnum != current_sentence) {
                            current_sentence = sentnum;
                            sentenceChanged = true;
                        }
                    }
                    if (sentenceChanged) {
                        outfile.write(outputline + "\n");
                        outputline = token;
                    } else {
                        if (!outputline.equals("")) {
                            outputline += " ";
                        }
                        outputline += token;
                    }
                } else {
                    if (blankscolumn != -1) {
                        // parse once instead of on every loop iteration
                        int blanks = Integer.parseInt(linearr[blankscolumn]);
                        for (int i = 0; i < blanks; i++) {
                            outputline += " ";
                        }
                    } else if (!outputline.equals("")) {
                        outputline += " ";
                    }
                    outputline += token;
                }
            }
            // write last sentence
            if (!outputline.equals("")) {
                outfile.write(outputline + "\n");
            }
        } finally {
            // chain the closes so a failure on the reader cannot skip the writer
            try {
                if (pipesreader != null) {
                    pipesreader.close();
                }
            } finally {
                if (outfile != null) {
                    outfile.close();
                }
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Joins this pipes file onto a model pipes file by comparing one column of
 * each, and writes the result to "&lt;file&gt;.paired".
 *
 * The model file drives the output: every model line is copied. For model
 * lines with enough columns, the next line of this file that is neither
 * "|" nor blank is read (only when the previous model token was paired)
 * and its paircol value is compared with the model's modelcol value. The
 * values pair when they are equal, when the model has a double quote and
 * this file has `` or '', or when the model has a bracket placeholder
 * (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-) and this file has the matching
 * bracket. If the model description has more than one column, the other
 * described columns of this file are appended when paired, and "-"
 * placeholders otherwise.
 *
 * Errors are reported on System.err; when the system property DEBUG is
 * "true" the stack trace is printed and the JVM exits.
 *
 * @param paircol  index of the column of this file used for pairing
 * @param model    path of the model pipes file
 * @param modelcol index of the column of the model file used for pairing
 * @return the output file name, or null if an error occurred
 */
public String pair_pipes_by_column_JOIN(int paircol, String model, int modelcol) {
    String outputfile = this.getFile().toString() + ".paired";
    // Declared outside the try so that every stream already opened is
    // released even when opening a later one fails.
    BufferedWriter outfile = null;
    BufferedReader modelreader = null;
    BufferedReader pipesreader = null;
    try {
        if (!this.isWellFormed) {
            throw new Exception("Malformed pipes file");
        }
        // NOTE(review): this accepts paircol == column count; confirm whether
        // addressing a column beyond the description is intended.
        if (paircol < 0 || this.pipes_desc_arr_count < paircol) {
            throw new Exception("Paircol (" + paircol + ") does not exist");
        }
        PipesFile modelpipes = new PipesFile(model);
        // NOTE(review): the result is ignored; if loading the description
        // fails the model column count stays 0 - confirm this is intended.
        modelpipes.isWellFormedOptimist();
        final int modelcount = modelpipes.getPipesDescArrCount();
        if (modelcol < 0 || modelcount < modelcol) {
            throw new Exception("Modelcol (" + modelcol + ") does not exist (total: " + modelcount + ") ");
        }
        try {
            outfile = new BufferedWriter(new FileWriter(outputfile));
            modelreader = new BufferedReader(new FileReader(model));
            pipesreader = new BufferedReader(new FileReader(this.f));
            String line;
            String pairline;
            String[] pairarr = null;
            String pipespair = "";
            boolean paired = true;
            while ((line = modelreader.readLine()) != null) {
                String[] linearr = line.split("\\|");
                if (linearr.length >= modelcount) {
                    String modelpair = linearr[modelcol];
                    if (paired) {
                        // advance this file only after a successful pairing
                        pairline = pipesreader.readLine();
                        // handle Freeling/TreeTagger/etc. newlines
                        while (pairline != null && (pairline.equals("|") || pairline.trim().equals(""))) {
                            pairline = pipesreader.readLine();
                        }
                        if (pairline != null) {
                            pairarr = pairline.split("\\|");
                            pipespair = pairarr[paircol];
                        }
                    }
                    boolean quoteMatch = modelpair.equals("\"") && (pipespair.equals("``") || pipespair.equals("''"));
                    boolean bracketMatch = (modelpair.equals("-LRB-") && pipespair.equals("("))
                            || (modelpair.equals("-RRB-") && pipespair.equals(")"))
                            || (modelpair.equals("-LSB-") && pipespair.equals("["))
                            || (modelpair.equals("-RSB-") && pipespair.equals("]"))
                            || (modelpair.equals("-LCB-") && pipespair.equals("{"))
                            || (modelpair.equals("-RCB-") && pipespair.equals("}"));
                    paired = modelpair.equals(pipespair) || quoteMatch || bracketMatch;
                }
                outfile.write(line);
                if (modelcount > 1) {
                    for (int i = 0; i < this.pipes_desc_arr_count; i++) {
                        if (i != paircol) {
                            if (paired && pairarr != null) {
                                outfile.write("|" + pairarr[i]);
                            } else {
                                outfile.write("|-");
                            }
                        }
                    }
                }
                if (paired) {
                    // consumed: forget the pair line so that the next one is read
                    pairarr = null;
                    pipespair = "";
                }
                outfile.write("\n");
            }
        } finally {
            // chain the closes so one failing close cannot skip the others
            try {
                if (pipesreader != null) {
                    pipesreader.close();
                }
            } finally {
                try {
                    if (modelreader != null) {
                        modelreader.close();
                    }
                } finally {
                    if (outfile != null) {
                        outfile.close();
                    }
                }
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Pairs the tokens of this pipes file (the "model") with the tokens of
 * ".roth-treetag" pair files and writes the merged lines to
 * "&lt;file&gt;.roth-treetag".
 *
 * For every line of this file, the pair file name is derived from column 0:
 * AnCoraPath + column 0 up to and including its last '.' +
 * "tbf.utf8.roth-treetag". Whenever that name changes, the previous pair
 * reader is closed and the new pair file is opened. The value in modelcol of
 * this file is compared with the value in paircol of the pair file; several
 * recovery steps (skipping bracketed spans, stray symbols, "inc" /
 * "nominalSent" constituents, multiword tokens joined by '_', or the rest of
 * the sentence) are attempted when they differ.
 *
 * Progress is traced on System.out. Reaching the end of a pair file during
 * recovery calls System.exit(1). Other errors are reported on System.err;
 * when the system property DEBUG is "true" the stack trace is printed and
 * the JVM exits.
 *
 * NOTE(review): the pair reader that is open when the loop ends does not
 * appear to be closed (the finally block closes only modelreader and
 * outfile) - confirm.
 *
 * To do this properly in the future, it would be best to keep a memory
 * buffer in order to pair groups of words (e.g. 5).
 *
 * @param modelcol   index of the column of this file used for pairing
 * @param AnCoraPath prefix prepended to the derived pair file name
 * @param paircol    index of the column of the pair file used for pairing
 * @return the output file name, or null if an error occurred
 */
public String pair_ancora2pipes(int modelcol, String AnCoraPath, int paircol) {
String outputfile = this.getFile().toString() + ".roth-treetag";
try {
if (!this.isWellFormed) {
throw new Exception("Malformed pipes file");
}
if (modelcol < 0 || this.pipes_desc_arr_count < modelcol) {
throw new Exception("Pairtextcol (" + modelcol + ") does not exist (total: " + this.pipes_desc_arr_count + ") ");
}
BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
BufferedReader modelreader = new BufferedReader(new FileReader(this.f));
try {
int syntcolumn = -1;
int lemmacolumn = -1;
// synt prefix collected while skipping pair lines; prepended to the synt column on output
String accomulatedsynt = "";
String line;
String pairline = "";
String[] pairarr = null;
String pipespair = "";
Boolean inMultiword = false;
Boolean paired = true;
String currentpairfile = "";
BufferedReader pipesreader = null;
PipesFile pairpipes = null;
while ((line = modelreader.readLine()) != null) {
String[] linearr = line.split("\\|");
String pairfile = AnCoraPath + linearr[0].substring(0, linearr[0].lastIndexOf('.') + 1) + "tbf.utf8.roth-treetag";
// As lines are read, decide which file to pair with...
// If the derived file does not exist and its name ends in "_<letter>.tbf...", retry without that suffix.
if (!(new File(pairfile).exists()) && pairfile.matches(".*_[a-z]\\.tbf\\.utf8\\.roth-treetag")) {
pairfile = pairfile.replaceFirst("_[a-z]\\.tbf", "\\.tbf");
}
// Switch to a new pair file: close the old reader, load its description and locate the synt/lemma columns.
if (!currentpairfile.equals(pairfile)) {
currentpairfile = pairfile;
if (pipesreader != null) {
pipesreader.close();
}
pairpipes = new PipesFile(pairfile);
pairpipes.isWellFormedOptimist();
if (paircol < 0 || pairpipes.getPipesDescArrCount() < paircol) {
throw new Exception("Paircol (" + paircol + ") does not exist");
}
pipesreader = new BufferedReader(new FileReader(pairfile));
syntcolumn = pairpipes.getColumn("synt");
lemmacolumn = pairpipes.getColumn("lemma");
System.out.println(pairfile + "(synt " + syntcolumn + ")(lemma " + lemmacolumn + ")");
}
/*
if(linearr[1].equals("2")){
System.exit(0);
}
*/
if (linearr.length >= this.pipes_desc_arr_count) {
String modelpair = linearr[modelcol];
// Read the next pair token only after the previous model token was paired; "|" lines are skipped.
if (paired) {
pairline = pipesreader.readLine();
while (pairline != null && pairline.equals("|")) {
pairline = pipesreader.readLine();
}
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
}
}
if (modelpair.equalsIgnoreCase(pipespair) || (modelpair.equalsIgnoreCase("Barça") && pipespair.equalsIgnoreCase("Barca"))) {
paired = true;
inMultiword = false;
System.out.println("pairing: " + pipespair);
} else {
// HACKs
System.out.println("not Equal " + pairline + " - " + line + "\n");
/*if (pipespair.equals("Madrid") && linearr[0].equals("107_19991001.txt") && linearr[1].equals("1") && linearr[2].equals("1")) {
pairline = pipesreader.readLine();
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
System.out.println("End of file reached...");
System.exit(1);
}
}*/
// Pair token "(": skip pair lines up to the one whose token is ")", tracking the
// parenthesis balance of the synt column, then continue with the line after it.
if (pipespair.equals("(")) {
System.out.println("NOT!! Find next braket ) -->");
int parlevel = 0;
parlevel += StringUtils.countOccurrencesOf(pairarr[syntcolumn], '(');
parlevel -= StringUtils.countOccurrencesOf(pairarr[syntcolumn], ')');
while (pairline != null && !pipespair.equals(")")) {
pairline = pipesreader.readLine();
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
System.out.println("End of file reached...");
System.exit(1);
}
parlevel += StringUtils.countOccurrencesOf(pairarr[syntcolumn], '(');
parlevel -= StringUtils.countOccurrencesOf(pairarr[syntcolumn], ')');
}
// a negative balance means the skipped span closed constituents opened earlier
if (parlevel < 0) {
for (int p = 0; p > parlevel; p--) {
accomulatedsynt += ")";
}
}
pairline = pipesreader.readLine();
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
System.out.println("End of file reached...while looking for )");
System.exit(1);
}
}
// Avoid symbols... skip them when the pair token is a lone symbol, the model token is not,
// and the synt column does not open an inc/nominalSent constituent
if (pipespair.matches("[\",%-]") && !modelpair.matches("[\",%-]") && !pairarr[syntcolumn].matches(".*\\((inc|nominalSent)((\\s+|\\().*)?")) {
System.out.println("NOT!! Omit symbols ) -->");
while (pairline != null && !pipespair.equals(modelpair)) {
accomulatedsynt += pairarr[syntcolumn];
Boolean sentend = false;
if (pairline.equals("|")) {
sentend = true;
}
pairline = pipesreader.readLine();
if (pairline != null) {
if (!pairline.equals("|")) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
pipespair = "|";
}
} else {
System.out.println("End of file reached...");
System.exit(1);
}
if (sentend) {
accomulatedsynt = "";
break;
}
}
}
/*if (pipespair.equals("SERGI LOPEZ-EGEA") && linearr[0].equals("107_20000701.txt") && linearr[1].equals("4") && linearr[2].equals("12")) {
pairline = pipesreader.readLine();
pairline = pipesreader.readLine();
pairline = pipesreader.readLine();
System.out.println("HACK) -->");
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
pairline = pipesreader.readLine();
}*/
if (modelpair.equals(pipespair)) {
paired = true;
inMultiword = false;
} else {
paired = false;
//if (modelpair.matches("-.+-")) {
// System.out.println("modelpair:"+modelpair+" - pipespair:"+pipespair);
// }else{
System.out.println("NOT!! modelpair:" + modelpair + " - pipespair:" + pipespair);
// if (pipespair.contains("_")) {
// Multiword attempt: compare the model token with the part of the pair token before the first '_'.
String trypair = pipespair;
if (pipespair.contains("_")) {
trypair = pipespair.substring(0, pipespair.indexOf('_'));
}
System.out.println("Try:" + trypair);
if (trypair.equals(modelpair)) {
inMultiword = true;
// keep the remainder of the multiword for the following model tokens
pipespair = pipespair.substring(pipespair.indexOf('_') + 1);
} else {
// While unpaired and the synt column opens an "(inc" or "(nominalSent" constituent,
// skip that constituent in the pair file and retry the comparison.
while (!paired && pairarr[syntcolumn].matches(".*\\((inc|nominalSent)((\\s+|\\().*)?")) {
// Remove INC
int parlevel = 0;
// 1000 acts as a "not present" marker for the two nesting levels below
int incparlevel = 1000;
int nomparlevel = 1000;
String accomulatedinc = "";
String accomulatednorm = "";
if (pairarr[syntcolumn].matches(".*\\(inc((\\s+|\\().*)?")) {
incparlevel = StringUtils.countOccurrencesOf(pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(inc")), '(');
accomulatedinc = pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(inc"));
}
if (pairarr[syntcolumn].matches(".*\\(nominalSent((\\s+|\\().*)?")) {
nomparlevel = StringUtils.countOccurrencesOf(pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(nominalSent")), '(');
accomulatednorm = pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(nominalSent"));
}
// NOTE(review): the nominalSent branch appends (+=) while the inc branch overwrites (=) - confirm this asymmetry is intended.
if (nomparlevel < incparlevel) {
incparlevel = nomparlevel;
accomulatedsynt += accomulatednorm;
} else {
accomulatedsynt = accomulatedinc;
}
parlevel += StringUtils.countOccurrencesOf(pairarr[syntcolumn], '(');
parlevel -= StringUtils.countOccurrencesOf(pairarr[syntcolumn], ')');
System.out.println("REMOVE INC/nominalsent=" + incparlevel);
while (pairline != null) {
System.out.println("palevel=" + parlevel + " - " + pairline + " ---(" + modelpair);
// the constituent is finished once the balance drops back to the level it was opened at
if (parlevel <= incparlevel) {
if (parlevel < incparlevel) {
for (int p = incparlevel; p > parlevel; p--) {
accomulatedsynt += ")";
}
}
System.out.println("END INC");
pairline = pipesreader.readLine();
while (pairline != null && pairline.equals("|")) {
pairline = pipesreader.readLine();
}
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
System.out.println("INC End of file reached...");
System.exit(1);
}
break;
}
pairline = pipesreader.readLine();
if (pairline != null && !pairline.equals("|")) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
parlevel += StringUtils.countOccurrencesOf(pairarr[syntcolumn], '(');
parlevel -= StringUtils.countOccurrencesOf(pairarr[syntcolumn], ')');
} else {
System.out.println("INC End of file/sentence reached...");
System.exit(1);
}
}
if (modelpair.equals(pipespair)) {
paired = true;
inMultiword = false;
} else {
if (pipespair.contains("_")) {
trypair = pipespair.substring(0, pipespair.indexOf('_'));
System.out.println("Try:" + trypair);
if (trypair.equals(modelpair)) {
inMultiword = true;
pipespair = pipespair.substring(pipespair.indexOf('_') + 1);
}
}
}
}
// Still unpaired, not inside a multiword, and the model token is not a double quote:
// skip the rest of the current pair sentence and retry with the first token of the next one.
if (!paired && !inMultiword && !modelpair.equals("\"")) { // && !modelpair.equals("\"")) {
System.out.println("NOT!! Find next sentence");
accomulatedsynt = "";
//System.exit(1);
while (pairline != null && !pairline.equals("|")) {
pairline = pipesreader.readLine();
}
while (pairline != null && pairline.equals("|")) {
pairline = pipesreader.readLine();
}
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
System.out.println("End of file reached...");
System.exit(1);
}
if (modelpair.equals(pipespair)) {
paired = true;
inMultiword = false;
} else {
if (pipespair.contains("_")) {
trypair = pipespair.substring(0, pipespair.indexOf('_'));
System.out.println("Try:" + trypair);
if (trypair.equals(modelpair)) {
inMultiword = true;
pipespair = pipespair.substring(pipespair.indexOf('_') + 1);
}
}
}
}
// Second pass: the same inc/nominalSent removal as above (without the "END INC" trace),
// applied to the token reached after the sentence skip.
while (!paired && pairarr[syntcolumn].matches(".*\\((inc|nominalSent)((\\s+|\\().*)?")) {
// Remove INC
int parlevel = 0;
int incparlevel = 1000;
int nomparlevel = 1000;
String accomulatedinc = "";
String accomulatednorm = "";
if (pairarr[syntcolumn].matches(".*\\(inc((\\s+|\\().*)?")) {
incparlevel = StringUtils.countOccurrencesOf(pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(inc")), '(');
accomulatedinc = pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(inc"));
}
if (pairarr[syntcolumn].matches(".*\\(nominalSent((\\s+|\\().*)?")) {
nomparlevel = StringUtils.countOccurrencesOf(pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(nominalSent")), '(');
accomulatednorm = pairarr[syntcolumn].substring(0, pairarr[syntcolumn].indexOf("(nominalSent"));
}
if (nomparlevel < incparlevel) {
incparlevel = nomparlevel;
accomulatedsynt += accomulatednorm;
} else {
accomulatedsynt = accomulatedinc;
}
parlevel += StringUtils.countOccurrencesOf(pairarr[syntcolumn], '(');
parlevel -= StringUtils.countOccurrencesOf(pairarr[syntcolumn], ')');
System.out.println("REMOVE INC/nominalsent=" + incparlevel);
while (pairline != null) {
System.out.println("palevel=" + parlevel + " - " + pairline + " ---(" + modelpair);
if (parlevel <= incparlevel) {
if (parlevel < incparlevel) {
for (int p = incparlevel; p > parlevel; p--) {
accomulatedsynt += ")";
}
}
pairline = pipesreader.readLine();
while (pairline != null && pairline.equals("|")) {
pairline = pipesreader.readLine();
}
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
System.out.println("INC End of file reached...");
System.exit(1);
}
break;
}
pairline = pipesreader.readLine();
if (pairline != null && !pairline.equals("|")) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
parlevel += StringUtils.countOccurrencesOf(pairarr[syntcolumn], '(');
parlevel -= StringUtils.countOccurrencesOf(pairarr[syntcolumn], ')');
} else {
System.out.println("INC End of file/sentence reached...");
System.exit(1);
}
}
if (modelpair.equals(pipespair)) {
paired = true;
inMultiword = false;
} else {
if (pipespair.contains("_")) {
trypair = pipespair.substring(0, pipespair.indexOf('_'));
System.out.println("Try:" + trypair);
if (trypair.equals(modelpair)) {
inMultiword = true;
pipespair = pipespair.substring(pipespair.indexOf('_') + 1);
}
}
}
}
// Second sentence skip: the same steps as the "Find next sentence" block above.
if (!paired && !inMultiword && !modelpair.equals("\"")) { // && !modelpair.equals("\"")) {
System.out.println("NOT!! Find next sentence");
accomulatedsynt = "";
//System.exit(1);
while (pairline != null && !pairline.equals("|")) {
pairline = pipesreader.readLine();
}
while (pairline != null && pairline.equals("|")) {
pairline = pipesreader.readLine();
}
if (pairline != null) {
pairarr = pairline.split("\\|");
pipespair = pairarr[paircol];
} else {
System.out.println("End of file reached...");
System.exit(1);
}
if (modelpair.equals(pipespair)) {
paired = true;
inMultiword = false;
} else {
if (pipespair.contains("_")) {
trypair = pipespair.substring(0, pipespair.indexOf('_'));
System.out.println("Try:" + trypair);
if (trypair.equals(modelpair)) {
inMultiword = true;
pipespair = pipespair.substring(pipespair.indexOf('_') + 1);
}
}
}
}
}
}
}
}
//System.out.println("paired output: " + line);
// Output: the model line followed by every pair column except paircol.
outfile.write(line);
if (pairpipes.getPipesDescArrCount() > 1) {
// NOTE(review): pairarr.length is read before the "pairarr != null" checks below, so a null pairarr would fail here - confirm it cannot be null at this point.
for (int i = 0; i < pairarr.length; i++) {
if (i != paircol) {
if (paired && pairarr != null) {
if (i == syntcolumn) {
outfile.write("|" + accomulatedsynt + pairarr[i]);
} else {
outfile.write("|" + pairarr[i]);
}
} else {
if (inMultiword && pairarr != null) {
// Inside a multiword: emit the leading part of the value and keep the remainder in pairarr for the next model token.
if (i == lemmacolumn) {
if (!pairarr[i].contains("_")) {
System.out.println("la caca maxima=" + pairarr[i]);
System.exit(1);
}
outfile.write("|" + pairarr[i].substring(0, pairarr[i].indexOf('_')));
pairarr[i] = pairarr[i].substring(pairarr[i].indexOf('_') + 1);
} else {
if (i == syntcolumn) {
if (pairarr[i].equals("*") || pairarr[i].equals("*)")) {
outfile.write("|" + accomulatedsynt + "*");
} else {
outfile.write("|" + accomulatedsynt + pairarr[i].substring(0, pairarr[i].lastIndexOf('*') + 1));
pairarr[i] = pairarr[i].substring(pairarr[i].lastIndexOf('*'));
}
} else {
// columns beyond the described ones are split at their last '*'
if (i >= pairpipes.getPipesDescArrCount()) {
if (pairarr[i].equals("*")) {
outfile.write("|*");
} else {
outfile.write("|" + pairarr[i].substring(0, pairarr[i].lastIndexOf('*') + 1));
pairarr[i] = pairarr[i].substring(pairarr[i].lastIndexOf('*'));
}
} else {
outfile.write("|-");
}
}
}
} else {
// Unpaired and not in a multiword: write placeholders ("*" for columns beyond the description, "-" otherwise).
if (i >= pairpipes.getPipesDescArrCount()) {
outfile.write("|*");
} else {
if (i == syntcolumn) {
outfile.write("|" + accomulatedsynt + "-");
} else {
outfile.write("|-");
}
}
}
}
}
}
accomulatedsynt = "";
}
// a paired token is consumed: reset the state so that the next pair line is read
if (paired) {
pairarr = null;
pairline = "";
pipespair = "";
}
outfile.write("\n");
}
} finally {
if (modelreader != null) {
modelreader.close();
}
if (outfile != null) {
outfile.close();
}
}
} catch (Exception e) {
System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return outputfile;
}
/**
 * Aligns the token column of this pipes file with a plain-text model,
 * character by character, and writes the result to "&lt;file&gt;.pre".
 *
 * For every token line, the token is replaced by the characters actually
 * found in the model (comparison ignores case) and a new column is
 * inserted right after it. That column holds the number of spaces that
 * preceded the token in the model. Lines with fewer columns than the
 * description are sentence boundaries: the model must then be at an end of
 * line ("\n" or "\r\n"), and the line is copied unchanged.
 *
 * The file must have been checked first (isWellFormed must be true). That
 * check now happens before the output file is opened, so a malformed file
 * no longer leaves a truncated, unclosed output file behind.
 *
 * Errors are reported on System.err; when the system property DEBUG is
 * "true" the stack trace is printed and the JVM exits.
 *
 * @param plainmodel path of the plain-text file the tokens are aligned to
 * @return the output file name, or null if an error occurred
 */
public String pair2plain(String plainmodel) {
    String outputfile = this.getFile().toString() + ".pre";
    // Declared outside the try so that every stream already opened is
    // released even when opening a later one fails.
    BufferedWriter outfile = null;
    BufferedReader modelreader = null;
    BufferedReader pipesreader = null;
    try {
        try {
            if (!this.isWellFormed) {
                throw new Exception("Malformed pipes file");
            }
            int tokcolumn = this.getColumn("(tok|word).*");
            outfile = new BufferedWriter(new FileWriter(outputfile));
            modelreader = new BufferedReader(new FileReader(plainmodel));
            pipesreader = new BufferedReader(new FileReader(this.f));
            String line;
            char cmodel = '\0';
            int offset = -1;
            // false when cmodel already holds a looked-ahead, unconsumed model char
            boolean readmodel = true;
            while ((line = pipesreader.readLine()) != null) {
                String[] linearr = line.split("\\|");
                if (linearr.length >= this.pipes_desc_arr_count) {
                    String token = linearr[tokcolumn];
                    int token_offset = -1;
                    int token_leading_blanks = 0;
                    String paired_token = "";
                    for (int cn = 0; cn < token.length(); cn++) {
                        char cpipes = token.charAt(cn);
                        if (readmodel) {
                            // Keep the int result: a char is never negative, so
                            // comparing the cast value with -1 could not detect EOF.
                            int nextchar = modelreader.read();
                            if (nextchar == -1) {
                                throw new Exception("Premature end of model file");
                            }
                            cmodel = (char) nextchar;
                            offset++;
                        } else {
                            readmodel = true;
                        }
                        if (Character.toLowerCase(cpipes) == Character.toLowerCase(cmodel)) {
                            paired_token += cmodel;
                            if (token_offset == -1) {
                                token_offset = offset;
                            }
                            // multi-dashes problem
                            if (cmodel == '-' && cn == token.length() - 1) {
                                // read a new char (cmodel) if not end of file to check multi-dash
                                int lookahead = modelreader.read();
                                if (lookahead != -1) {
                                    cmodel = (char) lookahead;
                                    readmodel = false;
                                    offset++;
                                    if (cmodel == '-') {
                                        // another dash in the model: compare the last token char again
                                        cn--;
                                    }
                                    if (cmodel == ' ' || cmodel == '\n' || cmodel == '\r' || cmodel == '\t') {
                                        // whitespace is consumed here; the token is finished
                                        cn++;
                                        readmodel = true;
                                    }
                                }
                            }
                        } else {
                            if (cmodel == ' ') {
                                // skip the model space and retry the same token char
                                cn--;
                                if (token_offset == -1) {
                                    token_leading_blanks++;
                                }
                            } else {
                                // Special for quotes
                                if (cmodel == '"' && ((cpipes == '`') || (cpipes == '\''))) {
                                    if (cn + 1 < token.length() && cpipes == token.charAt(cn + 1)) {
                                        cn += 2;
                                        paired_token += cmodel;
                                    }
                                } else {
                                    if (((cmodel == '\'' || cmodel == '`') && (cpipes == '`' || cpipes == '\'')) || (cmodel == '—') || (cmodel == '£')) {
                                        paired_token += cmodel;
                                    } else {
                                        throw new Exception("Distinct chars " + "offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")");
                                    }
                                }
                            }
                        }
                    }
                    // copy the columns, inserting the leading-blanks count after the token
                    for (int i = 0; i < linearr.length - 1; i++) {
                        if (i == tokcolumn) {
                            outfile.write(paired_token + "|" + token_leading_blanks + "|");
                        } else {
                            outfile.write(linearr[i] + "|");
                        }
                    }
                    outfile.write(linearr[linearr.length - 1] + "\n");
                } else { // newline new sentence
                    if (Character.toLowerCase(cmodel) != '\n' && Character.toLowerCase(cmodel) != '\r') {
                        if ((cmodel = (char) modelreader.read()) != (char) -1) {
                            offset++;
                            if (Character.toLowerCase(cmodel) != '\n' && Character.toLowerCase(cmodel) != '\r') {
                                throw new Exception("End of line not found (n) " + "offset=" + offset + ". cmodel(" + cmodel + ") found instead.");
                            } else {
                                if (Character.toLowerCase(cmodel) == '\r') {
                                    if ((cmodel = (char) modelreader.read()) != (char) -1) {
                                        offset++;
                                        if (Character.toLowerCase(cmodel) != '\n') {
                                            throw new Exception("End of line not found (rn)" + "offset=" + offset + ". cmodel(" + cmodel + ") found instead.");
                                        }
                                    }
                                }
                            }
                        }
                    } else {
                        if (Character.toLowerCase(cmodel) == '\r') {
                            if ((cmodel = (char) modelreader.read()) != (char) -1) {
                                offset++;
                                if (Character.toLowerCase(cmodel) != '\n') {
                                    throw new Exception("End of line not found (rn) " + "offset=" + offset + ". cmodel(" + cmodel + ") found instead.");
                                }
                            }
                        }
                    }
                    outfile.write(line + "\n");
                }
            }
        } finally {
            // chain the closes so one failing close cannot skip the others
            try {
                if (pipesreader != null) {
                    pipesreader.close();
                }
            } finally {
                try {
                    if (modelreader != null) {
                        modelreader.close();
                    }
                } finally {
                    if (outfile != null) {
                        outfile.close();
                    }
                }
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Projects the tags of an XML-annotated text onto this pipes (token) file.
 *
 * The XML file is read character by character and aligned (case-insensitively)
 * with the token column of this file, i.e. the first described column matching
 * "(token|word).*". Whitespace in the XML is skipped, tags are parsed, and
 * "&...;" entities are skipped up to their ';'. For every line that has at
 * least as many columns as the pipes description, two columns are inserted
 * after the last described column: an IOB2 label ("B-tag", "I-tag" or "O")
 * and the attributes of the opening tag ("-" when there are none or the token
 * is not the first one of the element). Shorter lines are copied unchanged.
 *
 * @param tmlfile     path of the XML file holding the same text as this file
 * @param root_tag    full content of the tag (text between '&lt;' and '&gt;',
 *                    compared ignoring case) after which the alignment starts
 * @param elements_re regular expression selecting the tag names to project
 * @param attribs_re  regular expression the attribute string of a selected tag
 *                    must match; non-matching tags are ignored
 * @param mergeattrib if not null, name of an attribute whose value is appended
 *                    to the tag name as "tag+value"
 * @return the path of the generated file, or null if any error occurred (the
 *         error is printed to System.err; with the system property DEBUG=true
 *         the stack trace is printed and the JVM exits)
 */
public String merge_tok_n_xml(String tmlfile, String root_tag, String elements_re, String attribs_re, String mergeattrib) {
    String outputfile = this.getFile().toString() + "-annotationKey";
    if (!elements_re.equals(".*")) {
        outputfile += "-" + elements_re;
    }
    if (!attribs_re.equals(".*")) {
        outputfile += "-" + attribs_re.replaceAll("([.]?\\*|[\"=])", "").replace('|', '_');
    }
    if (mergeattrib != null) {
        outputfile += "-" + mergeattrib;
    }
    try {
        // Validate before touching the file system so that no stream is opened
        // (and no empty output file is created) for a malformed input.
        if (!this.isWellFormed) {
            throw new Exception("Malformed pipes file");
        }
        int tokcolumn = this.getColumn("(token|word).*");
        int last_desc_column = this.getLastDescColumn();
        char cxml = '\0';
        String line;
        String tag = "", attribs = "-", inTag = "", inAttribs = "-";
        char BIO = 'O';
        // Declared outside and opened inside the try so that the finally block
        // releases whatever was opened even if a later open fails.
        BufferedReader xmlreader = null;
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            xmlreader = new BufferedReader(new FileReader(tmlfile));
            pipesreader = new BufferedReader(new FileReader(this.f));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            // find root tag (the loop only ends by finding it or by hitting
            // the end of the file, which readXmlCharOrFail reports)
            while (true) {
                cxml = readXmlCharOrFail(xmlreader);
                if (cxml == '<') {
                    cxml = readXmlCharOrFail(xmlreader);
                    do {
                        tag += cxml;
                        cxml = readXmlCharOrFail(xmlreader);
                    } while (cxml != '>');
                    if (tag.equalsIgnoreCase(root_tag)) {
                        break;
                    }
                    tag = "";
                }
            }
            tag = "";
            cxml = '\0';
            while ((line = pipesreader.readLine()) != null) {
                String[] linearr = line.split("\\|");
                if (linearr.length >= this.pipes_desc_arr_count) {
                    String token = linearr[tokcolumn];
                    boolean interTokenTag = false;
                    boolean findtokenIter = false;
                    boolean delayed_closing = false;
                    char prevxmlchar = 'x';
                    char prevprevxmlchar = 'x';
                    for (int cn = 0; cn < token.length(); cn++) {
                        char cpipes = token.charAt(cn);
                        prevprevxmlchar = prevxmlchar;
                        prevxmlchar = cxml;
                        cxml = readXmlCharOrFail(xmlreader);
                        if (Character.toLowerCase(cpipes) != Character.toLowerCase(cxml)) {
                            if (cxml == ' ' || cxml == '\n' || cxml == '\r' || cxml == '\t') {
                                // blank in the XML: retry the same token char
                                cn--;
                            } else {
                                // tags handling
                                if (cxml == '<') {
                                    if (cn != 0) {
                                        interTokenTag = true;
                                    }
                                    cn--;
                                    // read the tag content up to '>' using an int so
                                    // that end of file (-1) can be told apart from data
                                    int tagchar;
                                    while ((tagchar = xmlreader.read()) != -1 && tagchar != '>') {
                                        tag += (char) tagchar;
                                    }
                                    if (tagchar == -1) {
                                        throw new Exception("Premature end of model file");
                                    }
                                    cxml = (char) tagchar;
                                    tag = tag.trim();
                                    if (tag.indexOf(' ') != -1) {
                                        attribs = tag.substring(tag.indexOf(' ') + 1);
                                        tag = tag.substring(0, tag.indexOf(' '));
                                    }
                                    if (tag.matches("(?i)" + elements_re) && !tag.startsWith("/")) {
                                        findtokenIter = true;
                                        if (interTokenTag) {
                                            System.err.println("Inter-token (" + cn + ") tag consider manual tokenizing: " + token);
                                        }
                                        if (!inTag.equals("")) {
                                            throw new Exception("Nested tags (" + tag + "/" + inTag + ") consider manual correction");
                                        }
                                        inTag = tag;
                                        inAttribs = attribs;
                                        tag = "";
                                        attribs = "-";
                                        BIO = 'B';
                                        if (!inAttribs.matches("(?i)" + attribs_re)) {
                                            // attributes filtered out: behave as if no tag was opened
                                            BIO = 'O';
                                            inTag = "";
                                            inAttribs = "-";
                                            findtokenIter = false;
                                            interTokenTag = false;
                                        } else if (mergeattrib != null) {
                                            // only accepted tags get "+value" appended; doing it for a
                                            // rejected tag would index into the reset "-" attributes
                                            int attribpos = inAttribs.indexOf(mergeattrib + "=");
                                            if (attribpos == -1) {
                                                throw new Exception("Attribute " + mergeattrib + " not found in tag " + inTag + " (" + inAttribs + ")");
                                            }
                                            String tmpattrib = inAttribs.substring(attribpos + mergeattrib.length() + 1);
                                            tmpattrib = tmpattrib.replace("\"", "");
                                            if (tmpattrib.indexOf(' ') != -1) {
                                                tmpattrib = tmpattrib.substring(0, tmpattrib.indexOf(' '));
                                            }
                                            inTag = inTag + "+" + tmpattrib;
                                        }
                                    } else {
                                        interTokenTag = false;
                                    }
                                    // check if closing
                                    if (tag.matches("/.*")) {
                                        String check = inTag;
                                        if (mergeattrib != null && inTag.matches(".+\\+.+")) {
                                            // strip the "+value" suffix added when opening
                                            check = inTag.substring(0, inTag.indexOf('+'));
                                        }
                                        if (tag.matches("/" + "(?i)" + check)) {
                                            if (findtokenIter) {
                                                // safe for empty tags (events_4_instances and timex3_4_durations)
                                                if (cn >= 0) {
                                                    // tag closes inside the current token: label this
                                                    // token first and reset once the line is written
                                                    System.err.println("Inter Token end of tag (" + inTag + ") cn=" + cn + " " + line);
                                                    delayed_closing = true;
                                                } else {
                                                    BIO = 'O';
                                                    inTag = "";
                                                    inAttribs = "-";
                                                    findtokenIter = false;
                                                    interTokenTag = false;
                                                }
                                            } else {
                                                BIO = 'O';
                                                inTag = "";
                                            }
                                        }
                                    }
                                    // check if end root_tag
                                    if (tag.matches("/" + "(?i)" + root_tag)) {
                                        System.err.println("closing root_tag=" + root_tag);
                                        // do something
                                        // it never reaches this because tok file ends before.
                                    }
                                    tag = "";
                                    attribs = "-";
                                } else {
                                    // escaped & < >
                                    if (cxml == '&' || (prevxmlchar == '&' && cxml == 'a') || (prevprevxmlchar == ';' && prevxmlchar == ' ' && cxml == 'a')) {
                                        cn--;
                                        // skip the rest of the entity, up to and including ';'
                                        int entitychar;
                                        while ((entitychar = xmlreader.read()) != -1 && entitychar != ';') {
                                            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                                                System.err.println("Reading XML escaped char in: " + token);
                                            }
                                        }
                                        if (entitychar == -1) {
                                            throw new Exception("Premature end of model file");
                                        }
                                        cxml = (char) entitychar;
                                    } else {
                                        throw new Exception("Distinct chars cxml(" + cxml + ") cpipes(" + cpipes + ")");
                                    }
                                }
                            }
                        }
                    }
                    for (int i = 0; i < linearr.length - 1; i++) {
                        // There are roles columns in the sentence
                        if (i == last_desc_column) {
                            outfile.write(linearr[i] + "|" + BIO);
                            if (BIO != 'O') {
                                outfile.write("-" + inTag);
                            }
                            outfile.write("|" + inAttribs + "|");
                            if (BIO == 'B') {
                                // following tokens of the element are "I-" without attributes
                                BIO = 'I';
                                inAttribs = "-";
                            }
                        } else {
                            outfile.write(linearr[i] + "|");
                        }
                    }
                    // There arent roles columns in the sentences
                    if (linearr.length - 1 == last_desc_column) {
                        outfile.write(linearr[linearr.length - 1] + "|" + BIO);
                        if (BIO != 'O') {
                            outfile.write("-" + inTag);
                        }
                        outfile.write("|" + inAttribs);
                        if (BIO == 'B') {
                            BIO = 'I';
                            inAttribs = "-";
                        }
                    } else {
                        outfile.write(linearr[linearr.length - 1]);
                    }
                    if (delayed_closing) {
                        BIO = 'O';
                        inTag = "";
                        inAttribs = "-";
                        findtokenIter = false;
                        interTokenTag = false;
                        delayed_closing = false;
                    }
                    outfile.write("\n");
                } else {
                    // short line (fewer columns than described): no element may span it
                    if (!inTag.equals("")) {
                        throw new Exception("Broken tag: " + inTag + " at the end of the file/sentence");
                    }
                    outfile.write(line + "\n");
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (xmlreader != null) {
                xmlreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}

/**
 * Reads one character from the XML reader. The value is tested as an int
 * before the cast: once narrowed to char, the end-of-stream marker -1 becomes
 * '\uffff' and can no longer be compared against -1.
 *
 * @throws Exception if the end of the stream has been reached
 */
private static char readXmlCharOrFail(Reader xmlreader) throws Exception {
    int c = xmlreader.read();
    if (c == -1) {
        throw new Exception("Premature end of model file");
    }
    return (char) c;
}
/**
 * Convenience overload: checks the IOB2 labels stored in the last column
 * described for the given pipes file.
 *
 * @param pipesfile the pipes file to check
 * @return whatever {@code IOB2check(pipesfile, column)} returns for that column
 */
public static String IOB2check(PipesFile pipesfile) {
    final int lastDescribedColumn = pipesfile.getLastDescColumn();
    return IOB2check(pipesfile, lastDescribedColumn);
}
/**
 * Writes a copy of the pipes file (suffix "-IOB2checked") in which every
 * "I-x" label that does not continue an element is turned into "B-x".
 *
 * A label is considered not to continue an element when the previous label
 * was "O", when the previous line had a single column, or, if the file has a
 * "sent-num" column, when the sentence number changes or the "tok-num" value
 * matches "[^-]+-[^n]*n.*" (new-line mark of the new format).
 *
 * Lines that lack any of the inspected columns are copied unchanged instead
 * of aborting the whole check.
 *
 * @param pipesfile  the pipes file to check
 * @param IOB2column index of the column holding the IOB2 labels
 * @return the path of the checked file, or null if any error occurred (the
 *         error is printed to System.err; with the system property DEBUG=true
 *         the stack trace is printed and the JVM exits)
 */
public static String IOB2check(PipesFile pipesfile, int IOB2column) {
    String outputfile = null;
    try {
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            System.err.println("Cheking IOB2...");
        }
        outputfile = pipesfile.getFile().getCanonicalPath() + "-IOB2checked";
        // Opened inside the try so that the finally block releases the first
        // stream even if opening the second one fails.
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            pipesreader = new BufferedReader(new FileReader(pipesfile.getFile()));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            String pipesline;
            String previousIOB2 = "O";
            String previousSENT = "O";
            int toknumcol = pipesfile.getColumn("tok-num");
            int sentnumcol = pipesfile.getColumn("sent-num");
            while ((pipesline = pipesreader.readLine()) != null) {
                String[] pipesarr = pipesline.split("\\|");
                // a column is usable only if it is described AND present in this line
                boolean hasSent = sentnumcol != -1 && sentnumcol < pipesarr.length;
                boolean hasTokNum = toknumcol != -1 && toknumcol < pipesarr.length;
                boolean hasIOB2 = pipesarr.length > 1 && IOB2column >= 0 && IOB2column < pipesarr.length;
                // take into account sentences and newlines (only new format n)
                if (hasSent && (!pipesarr[sentnumcol].equals(previousSENT)
                        || (hasTokNum && pipesarr[toknumcol].matches("[^-]+-[^n]*n.*")))) {
                    previousIOB2 = "O";
                }
                if (hasIOB2 && previousIOB2.equals("O") && pipesarr[IOB2column].startsWith("I-")) {
                    // rebuild the line with the dangling "I-" turned into "B-"
                    StringBuilder fixed = new StringBuilder();
                    for (int i = 0; i < pipesarr.length; i++) {
                        if (i > 0) {
                            fixed.append('|');
                        }
                        if (i == IOB2column) {
                            fixed.append("B-").append(pipesarr[i].substring(2));
                        } else {
                            fixed.append(pipesarr[i]);
                        }
                    }
                    outfile.write(fixed.toString() + "\n");
                    previousIOB2 = "B";
                } else {
                    outfile.write(pipesline + "\n");
                    if (hasIOB2 && pipesarr[IOB2column].length() > 0) {
                        previousIOB2 = pipesarr[IOB2column].substring(0, 1);
                    } else {
                        previousIOB2 = "O";
                    }
                }
                if (hasSent) {
                    previousSENT = pipesarr[sentnumcol];
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Merges two pipes files line by line into "primary-merged". The priority is
 * on the file of the first parameter: a line of the primary file is kept
 * whenever its "element(IOB2)" column is not "O"; otherwise the corresponding
 * line of the secondary file is written. When a secondary element starts with
 * "I-" right after a primary element (or at the very beginning), it is
 * rewritten as "B-". A "|-" column is appended to lines whose IOB2 column is
 * the last one.
 *
 * @param primary   path of the pipes file whose elements prevail (may be null)
 * @param secondary path of the pipes file merged in (may be null)
 * @return null if both paths are null; the other path if one of them is null;
 *         otherwise the path of the merged file, or null if any error occurred
 *         (the error is printed to System.err; with the system property
 *         DEBUG=true the stack trace is printed and the JVM exits)
 */
public static String merge_pipes(String primary, String secondary) {
    String outputfile = null;
    // basic check
    if (primary == null && secondary == null) {
        return null;
    }
    if (primary == null) {
        return secondary;
    }
    if (secondary == null) {
        return primary;
    }
    try {
        outputfile = primary + "-merged";
        // Resolve the IOB2 columns before opening any stream so that a missing
        // column is reported clearly instead of as an array index error.
        PipesFile keypipes = new PipesFile(primary);
        keypipes.isWellFormedOptimist();
        int iob2col1 = keypipes.getColumn("element\\(IOB2\\)");
        keypipes = new PipesFile(secondary);
        keypipes.isWellFormedOptimist();
        int iob2col2 = keypipes.getColumn("element\\(IOB2\\)");
        if (iob2col1 == -1 || iob2col2 == -1) {
            throw new Exception("element(IOB2) column not found in the files to merge");
        }
        // Opened inside the try so that the finally block releases the streams
        // already opened if a later open fails.
        BufferedReader extentsreader = null;
        BufferedReader extentsreader2 = null;
        BufferedWriter outfile = null;
        try {
            extentsreader = new BufferedReader(new FileReader(primary));
            extentsreader2 = new BufferedReader(new FileReader(secondary));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            String extentline;
            String[] extentarr = null;
            String extentline2;
            String[] extentarr2 = null;
            boolean firstO = true;
            while ((extentline = extentsreader.readLine()) != null) {
                extentarr = extentline.split("\\|");
                if ((extentline2 = extentsreader2.readLine()) == null) {
                    throw new Exception("Secondary file ended prematurely.");
                }
                extentarr2 = extentline2.split("\\|");
                if (!extentarr[iob2col1].equals("O") && !extentarr2[iob2col2].equals("O")) {
                    System.err.println("Error merging pipes files!! overlaping elements.\n" + extentline + "\n" + extentline2);
                    System.err.println("Ignoring event");
                }
                if (!extentarr[iob2col1].equals("O")) {
                    if (iob2col1 == (extentarr.length - 1)) {
                        outfile.write(extentline + "|-\n");
                    } else {
                        outfile.write(extentline + "\n");
                    }
                    firstO = true;
                } else {
                    if (firstO && extentarr2[iob2col2].startsWith("I-")) {
                        String tmpelem = extentarr2[iob2col2].substring(2);
                        // Literal replacement: element names may contain regex
                        // metacharacters (e.g. "EVENT+OCCURRENCE"), which made the
                        // former regex-based replaceAll silently match nothing.
                        extentline2 = extentline2.replace("|I-" + tmpelem, "|B-" + tmpelem);
                    }
                    if (iob2col2 == (extentarr2.length - 1)) {
                        outfile.write(extentline2 + "|-\n");
                    } else {
                        outfile.write(extentline2 + "\n");
                    }
                    firstO = false;
                }
            }
        } finally {
            if (extentsreader != null) {
                extentsreader.close();
            }
            if (extentsreader2 != null) {
                extentsreader2.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Looks up a column of the pipes description by name.
 *
 * @param colname_re regular expression that must match the whole column name
 * @return the index of the first described column whose name matches, or -1
 *         if none matches or the lookup fails (a failure is printed to
 *         System.err; with the system property DEBUG=true the stack trace is
 *         printed and the JVM exits)
 */
public int getColumn(String colname_re) {
    int found = -1;
    try {
        int idx = 0;
        while (found == -1 && idx < this.pipes_desc_arr_count) {
            if (this.pipes_desc_arr[idx].matches(colname_re)) {
                found = idx;
            }
            idx++;
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        final String debug = System.getProperty("DEBUG");
        if (debug != null && debug.equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return -1;
    }
    return found;
}
/**
 * @return the index of the last column listed in the pipes description,
 *         i.e. the number of described columns minus one
 */
public int getLastDescColumn() {
    final int described_columns = this.pipes_desc_arr_count;
    return described_columns - 1;
}
/**
 * Saves one column of this pipes file into "thisfile.colname", one value per
 * line. Lines with fewer columns than the pipes description produce an empty
 * line in the output.
 *
 * @param colname regular expression matching the name of the column to save;
 *                it is also used verbatim as the suffix of the output file
 * @return the path of the generated file, or null if the file is malformed,
 *         the column does not exist or any other error occurred (the error is
 *         printed to System.err; with the system property DEBUG=true the
 *         stack trace is printed and the JVM exits)
 */
public String saveColumnFile(String colname) {
    String outputfile = this.getFile().getAbsolutePath() + "." + colname;
    try {
        // Validate before opening any stream so that nothing leaks and no
        // empty output file is left behind on invalid input.
        if (!this.isWellFormed) {
            throw new Exception("Malformed pipes file");
        }
        int col = this.getColumn(colname);
        if (col == -1) {
            throw new Exception("Column " + colname + " not found");
        }
        BufferedReader pipesreader = null;
        BufferedWriter outfile = null;
        try {
            pipesreader = new BufferedReader(new FileReader(this.f));
            outfile = new BufferedWriter(new FileWriter(outputfile));
            String line;
            while ((line = pipesreader.readLine()) != null) {
                String[] linearr = line.split("\\|");
                if (linearr.length >= this.pipes_desc_arr_count) {
                    outfile.write(linearr[col] + "\n");
                } else {
                    // short line: keep it as an empty line
                    outfile.write("\n");
                }
            }
        } finally {
            if (pipesreader != null) {
                pipesreader.close();
            }
            if (outfile != null) {
                outfile.close();
            }
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
        return null;
    }
    return outputfile;
}
/**
 * Feeds the lines of this pipes file into a Stat object, which may be empty
 * or already hold other statistics. Only lines having at least as many
 * columns as the pipes description are added; shorter lines are skipped.
 * Errors are printed to System.err (with the system property DEBUG=true the
 * stack trace is printed and the JVM exits).
 *
 * @param st     the statistics object receiving each split line
 * @param params not used by this implementation
 */
public void fillStats(Stat st, String params) {
    try {
        final BufferedReader pipesreader = new BufferedReader(new FileReader(this.f));
        try {
            for (String row = pipesreader.readLine(); row != null; row = pipesreader.readLine()) {
                final String[] fields = row.split("\\|");
                if (fields.length < this.pipes_desc_arr_count) {
                    continue;
                }
                st.addData(fields);
            }
        } finally {
            pipesreader.close();
        }
    } catch (Exception e) {
        System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
        final String debug = System.getProperty("DEBUG");
        if (debug != null && debug.equalsIgnoreCase("true")) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
    }
}
/**
 * Not supported for pipes files.
 *
 * @param filename ignored
 * @throws UnsupportedOperationException always
 */
@Override
public String toPlain(String filename) {
    final String reason = "toPlain not applicable to this type of file";
    throw new UnsupportedOperationException(reason);
}
}