package is2.io;

import is2.data.SentenceData09;
import is2.parser.Parser;
import is2.util.DB;

import java.io.*;
import java.util.StringTokenizer;

/**
 * Writes {@link SentenceData09} instances as tab-separated text, one token
 * per line and one empty line after each sentence. The columns are, in order:
 * id, form, lemma, plemma, gpos, ppos, ofeats, pfeats, head, phead, label,
 * plabel, fillp, followed by the predicate and argument columns. Missing
 * values are written as {@code DASH}.
 *
 * <p>The class also carries a small command line tool ({@link #main}) for
 * checking files for stray whitespace and inconsistent column counts.
 */
public class CONLLWriter09 extends CONLLWriter {

    /** Selected output format; {@link #write(SentenceData09, boolean)} compares it with {@code F_ONE_LINE}. */
    int format = 0;

    /** Values for the {@code root} flag of {@link #write(SentenceData09, boolean)}. */
    public static final boolean NO_ROOT = true, ROOT = false;

    /** Creates a writer without an output target. */
    public CONLLWriter09() {
    }

    /**
     * Command line entry point. The mode is selected by the number of arguments:
     * <ul>
     * <li>two arguments: copy {@code args[0]} to {@code args[1]} with every line trimmed
     *     and report whether any line was changed;</li>
     * <li>three arguments: compare {@code args[1]} and {@code args[2]} line by line
     *     ({@code args[0]} is not used) and report lines whose number of tab-separated
     *     tokens differs;</li>
     * <li>any other non-zero count: treat {@code args[0]} as a directory and report
     *     every file in it that contains a line ending in a tab.</li>
     * </ul>
     *
     * @param args command line arguments, see above
     * @throws IOException if a file cannot be read or written
     */
    public static void main(String args[]) throws IOException {
        if (args.length == 0) {
            // Previously this fell through to args[0] and died with an index error.
            DB.println("usage: <in-file> <out-file> | <ignored> <file1> <file2> | <directory>");
            return;
        }

        if (args.length == 2) {
            trimLines(new File(args[0]), new File(args[1]));
        } else if (args.length == 3) {
            compareTokenCounts(new File(args[1]), new File(args[2]));
        } else {
            reportTrailingTabs(args[0]);
        }
    }

    /** Opens a UTF-8 reader with the buffer size used throughout this tool. */
    private static BufferedReader openReader(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 32768);
    }

    /** Copies source to target with each line trimmed and reports if any line changed. */
    private static void trimLines(File source, File target) throws IOException {
        boolean found = false;
        boolean tab = false;

        BufferedReader in = openReader(source);
        try {
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target), "UTF8"));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    String trimmed = line.trim();
                    // Test the raw line: trim() removes tabs, so the trimmed
                    // text can never end with one.
                    if (line.endsWith("\t")) {
                        tab = true;
                    }
                    out.write(trimmed);
                    out.newLine();
                    if (!line.equals(trimmed)) {
                        found = true;
                    }
                }
                out.flush();
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }

        if (found) {
            DB.println("found diff. found tab? " + tab);
        }
    }

    /**
     * Compares two files line by line on their number of tab-separated tokens.
     * Exits the JVM on the first line where the counts differ.
     */
    private static void compareTokenCounts(File first, File second) throws IOException {
        int line = 0, alltabs1 = 0, alltabs2 = 0;

        BufferedReader in1 = openReader(first);
        try {
            BufferedReader in2 = openReader(second);
            try {
                while (true) {
                    String l1 = in1.readLine();
                    String l2 = in2.readLine();

                    if ((l1 == null) != (l2 == null)) {
                        DB.println("files do not end at the same line ");
                    }
                    // Stop as soon as either file ends; tokenizing a null line
                    // would throw a NullPointerException.
                    if (l1 == null || l2 == null) {
                        break;
                    }

                    // StringTokenizer skips empty fields, so this counts
                    // non-empty tokens rather than tab characters.
                    int tabs1 = new StringTokenizer(l1, "\t").countTokens();
                    int tabs2 = new StringTokenizer(l2, "\t").countTokens();
                    alltabs1 += tabs1;
                    alltabs2 += tabs2;
                    line++;

                    if (tabs1 != tabs2) {
                        DB.println("number of tabs different in line " + line + " file1-tabs " + tabs1 + " file2-tabs " + tabs2);
                        System.exit(0);
                    }
                }
            } finally {
                in2.close();
            }
        } finally {
            in1.close();
        }

        DB.println("checked lines " + line + " with tabs in file 1 " + alltabs1 + " in file2 " + alltabs2);
    }

    /** Reports each file of the given directory that has a line ending in a tab. */
    private static void reportTrailingTabs(String directory) throws IOException {
        String[] names = new File(directory).list();
        if (names == null) {
            // File.list() yields null when the path is not a directory or cannot be read.
            DB.println("cannot list directory " + directory);
            return;
        }

        for (String name : names) {
            BufferedReader in = openReader(new File(directory + File.separatorChar + name));
            try {
                Parser.out.println("check file " + name);
                String line;
                while ((line = in.readLine()) != null) {
                    if (line.endsWith("\t")) {
                        DB.println("found tab in file " + name);
                        break;
                    }
                }
            } finally {
                in.close();
            }
        }
    }

    /**
     * Creates a writer for the given file, encoded as UTF-8.
     *
     * <p>A failure to open the file is printed and not rethrown; the writer
     * is then left unset.
     *
     * @param file path of the output file
     */
    public CONLLWriter09(String file) {
        try {
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF8"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a writer on top of an existing {@link Writer}.
     *
     * @param writer destination; it is wrapped in a {@link BufferedWriter}
     */
    public CONLLWriter09(Writer writer) {
        this.writer = new BufferedWriter(writer);
    }

    /**
     * Creates a writer for the given file.
     *
     * @param outfile path of the output file
     * @param formatTask currently not applied; use {@link #setOutputFormat(int)}
     */
    public CONLLWriter09(String outfile, int formatTask) {
        // NOTE(review): formatTask is ignored here - confirm whether callers
        // expect it to select the output format before changing this.
        this(outfile);
    }

    /**
     * Writes the sentence including its first token (same as passing {@link #ROOT}).
     *
     * @param inst the sentence to write
     * @throws IOException if writing fails
     */
    @Override
    public void write(SentenceData09 inst) throws IOException {
        write(inst, ROOT);
    }

    /**
     * Writes one sentence followed by an empty line. In the one-line format
     * only the predicted lemmas are written, separated by blanks, and no
     * empty line follows.
     *
     * @param inst the sentence to write
     * @param root true: remove the root node, i.e. skip token 0 when its form
     *        or lemma starts with {@code <root}
     * @throws IOException if writing fails
     */
    public void write(SentenceData09 inst, boolean root) throws IOException {
        boolean skipRoot = root
                && (inst.forms[0].startsWith("<root")
                || (inst.lemmas != null && inst.lemmas[0] != null && inst.lemmas[0].startsWith("<root")));

        // When the root token is kept it occupies index 0, so generated ids
        // are shifted by one to start at 1.
        int start = skipRoot ? 1 : 0;
        int mod = skipRoot ? 0 : 1;

        if (format == CONLLWriter09.F_ONE_LINE) {
            for (int i = start; i < inst.length(); i++) {
                if (i > start) {
                    writer.write(" ");
                }
                writer.write(inst.plemmas[i]);
            }
            writer.newLine();
            return;
        }

        for (int i = start; i < inst.length(); i++) {
            // id: use the stored id when present, otherwise the position
            if (inst.id == null || inst.id[i] == null) {
                writer.write(Integer.toString(i + mod));
            } else {
                writer.write(inst.id[i]);
            }
            writer.write('\t');

            writer.write(inst.forms[i]); // form
            writer.write('\t');

            writeValueOrDash(inst.lemmas, i); // lemma
            writer.write('\t');
            writeValueOrDash(inst.plemmas, i); // plemma
            writer.write('\t');
            writeValueOrDash(inst.gpos, i); // gpos
            writer.write('\t');
            writeValueOrDash(inst.ppos, i); // ppos
            writer.write('\t');
            writeValueOrDash(inst.ofeats, i); // feat
            writer.write('\t');
            writeValueOrDash(inst.pfeats, i); // pfeat
            writer.write('\t');

            writer.write(Integer.toString(inst.heads[i])); // head
            writer.write('\t');

            if (inst.pheads != null) {
                writer.write(Integer.toString(inst.pheads[i])); // phead
            } else {
                writer.write(DASH);
            }
            writer.write('\t');

            writeValueOrDash(inst.labels, i); // rel
            writer.write('\t');
            writeValueOrDash(inst.plabels, i); // prel
            writer.write('\t');
            writeValueOrDash(inst.fillp, i); // fill p

            // The semantic columns each write their own leading tab.
            writeSemanticColumns(inst, i);
            writer.newLine();
        }
        writer.newLine();
    }

    /** Writes column[index], or DASH when the column or the entry is missing. */
    private void writeValueOrDash(String[] column, int index) throws IOException {
        if (column != null && column[index] != null) {
            writer.write(column[index]);
        } else {
            writer.write(DASH);
        }
    }

    /** Writes the predicate column(s) and one argument column per predicate for token i. */
    private void writeSemanticColumns(SentenceData09 inst, int i) throws IOException {
        if (inst.sem == null) {
            writer.write('\t');
            writer.write(DASH);
            return;
        }

        // print the predicate
        boolean foundPred = false;
        for (int p = 0; p < inst.sem.length; p++) {
            if (inst.semposition[p] == i) {
                foundPred = true;
                writer.write('\t');
                writer.write(inst.sem[p]);
            }
        }
        if (!foundPred) {
            writer.write('\t');
            writer.write(DASH);
        }

        // print the arguments
        for (int p = 0; p < inst.sem.length; p++) {
            boolean found = false;
            if (inst.arg != null && inst.arg.length > p && inst.arg[p] != null) {
                for (int a = 0; a < inst.arg[p].length; a++) {
                    if (i == inst.argposition[p][a]) {
                        writer.write('\t');
                        writer.write(inst.arg[p][a]);
                        found = true;
                        break;
                    }
                }
            }
            if (!found) {
                writer.write('\t');
                writer.write(DASH);
            }
        }
    }

    /**
     * Sets the output format such as CoNLL or one line for the lemmata of the
     * sentence (see F_xxxx constants).
     *
     * @param formatTask the format constant to use for subsequent writes
     */
    public void setOutputFormat(int formatTask) {
        format = formatTask;
    }
}