package is2.io; import is2.data.SentenceData09; import is2.parser.Parser; import is2.util.DB; import java.io.*; import java.util.StringTokenizer; public class CONLLWriter06 extends CONLLWriter { public CONLLWriter06() { } public static void main(String args[]) throws IOException { if (args.length == 2) { File f = new File(args[0]); File f2 = new File(args[1]); BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(f), "ISO-8859"), 32768); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f2), "UTF-8"));; boolean found = false; boolean tab = false; while (true) { String l = ir.readLine(); if (l == null) { break; } String x = l.trim(); if (x.endsWith("\t")) { tab = true; } br.write(x); br.newLine(); if (!l.equals(x)) { found = true; } } ir.close(); br.flush(); br.close(); if (found) { DB.println("found diff. found tab? " + tab); } } else if (args.length == 3) { File f1 = new File(args[1]); File f2 = new File(args[2]); BufferedReader ir1 = new BufferedReader(new InputStreamReader(new FileInputStream(f1), "ISO-8859"), 32768); BufferedReader ir2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2), "UTF-8"), 32768); int line = 0, alltabs1 = 0, alltabs2 = 0; while (true) { String l1 = ir1.readLine(); String l2 = ir2.readLine(); if (l1 == null && l2 != null) { DB.println("files do not end at the same line "); } if (l1 != null && l2 == null) { DB.println("files do not end at the same line "); } if (l1 == null) { break; } StringTokenizer t1 = new StringTokenizer(l1, "\t"); StringTokenizer t2 = new StringTokenizer(l2, "\t"); int tabs1 = 0; while (t1.hasMoreTokens()) { t1.nextElement(); tabs1++; alltabs1++; } int tabs2 = 0; while (t2.hasMoreTokens()) { t2.nextElement(); tabs2++; alltabs2++; } line++; if (tabs1 != tabs2) { DB.println("number of tabs different in line " + line + " file1-tabs " + tabs1 + " file2-tabs " + tabs2); System.exit(0); } } DB.println("checked lines " + line + " with tabs in file 1 " + alltabs1 + " in file2 " + alltabs2); } else { File f = new File(args[0]); String[] dir = f.list(); for (String fx : dir) { BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(args[0] + File.separatorChar + fx), "UTF-8"), 32768); Parser.out.println("check file " + fx); while (true) { String l = ir.readLine(); if (l == null) { break; } if (l.endsWith("\t")) { DB.println("found tab in file " + fx); break; } } ir.close(); } } } public CONLLWriter06(String file) { try { writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")); } catch (Exception e) { e.printStackTrace(); } } public CONLLWriter06(String outfile, int formatTask) { this(outfile); } @Override public void write(SentenceData09 inst) throws IOException { for (int i = 0; i < inst.length(); i++) { writer.write(Integer.toString(i + 1)); writer.write('\t'); // id writer.write(inst.forms[i]); writer.write('\t'); // form if (inst.lemmas != null && inst.lemmas[i] != null) { writer.write(inst.lemmas[i]); } else { writer.write(DASH); // lemma } writer.write('\t'); // writer.write(DASH); // cpos // writer.write('\t'); writer.write(inst.gpos[i]); // cpos has to be included writer.write('\t'); writer.write(inst.ppos[i]); // ppos writer.write('\t'); if (inst.ofeats[i].isEmpty() || inst.ofeats[i].equals(" ")) { writer.write(DASH); } else { writer.write(inst.ofeats[i]); } writer.write('\t'); //writer.write(DASH); writer.write('\t'); // pfeat writer.write(Integer.toString(inst.heads[i])); writer.write('\t'); // head if (inst.labels[i] != null) { writer.write(inst.labels[i]); // rel } else { writer.write(DASH); } writer.write('\t'); writer.write(DASH); writer.write('\t'); writer.write(DASH); writer.write('\t'); writer.newLine(); } writer.newLine(); } }