package is2.util;

import is2.data.SentenceData09;
import is2.io.*;
import is2.parser.Parser;
import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;

/**
 * Command-line utility that rewrites a dependency treebank file from one
 * format into another using the {@code CONLLReader06/08/09} and
 * {@code CONLLWriter06/09} classes, plus a plain character-set conversion.
 *
 * <p>In every sentence-based conversion the element at index 0 of the
 * sentence read from the reader is skipped (the local arrays are named
 * {@code ...NoRoot}), and a new {@link SentenceData09} is built from the
 * remaining tokens and handed to the writer.
 *
 * @author Dr. Bernd Bohnet, 01.03.2010
 */
public class Convert {

    /** Default mode: {@link #convert(String, String)}. */
    private static final int MODE_DEFAULT = 9;
    /** Selected by {@code -w06}: {@link #convert0906(String, String)}. */
    private static final int MODE_W06 = 6;
    /** Selected by {@code -w0809}: {@link #convert0809(String, String)}. */
    private static final int MODE_W0809 = 89;
    /** Selected by {@code -yue}: {@link #convertChnYue(String, String, boolean)}. */
    private static final int MODE_YUE = 99;
    /** Selected by {@code -utf8}: character-set conversion to UTF-8. */
    private static final int MODE_UTF8 = 8;

    /**
     * Entry point. {@code args[0]} is the input file and {@code args[1]} the
     * output file; the remaining arguments select the conversion. When several
     * mode flags are given the last one wins. In {@code -utf8} mode
     * {@code args[2]} is used as the name of the input character set.
     *
     * @param args command-line arguments
     * @throws Exception if the selected conversion fails
     */
    public static void main(String args[]) throws Exception {
        if (args.length < 2) {
            Parser.out.println("Usage");
            Parser.out.println(" java is2.util.Convert <in> <out> [-w06|-w0809|-yue] [-wordsonly]");
            // Without input and output file there is nothing to do; continuing
            // would only fail with an ArrayIndexOutOfBoundsException below.
            return;
        }

        int todo = MODE_DEFAULT;
        boolean wordsOnly = false;
        // All arguments are scanned, including the two file names.
        for (String a : args) {
            if ("-w06".equals(a)) {
                todo = MODE_W06;
            } else if ("-w0809".equals(a)) {
                todo = MODE_W0809;
            } else if ("-yue".equals(a)) {
                todo = MODE_YUE;
            } else if ("-utf8".equals(a)) {
                todo = MODE_UTF8;
            }
            if ("-wordsonly".equals(a)) {
                wordsOnly = true;
            }
        }

        switch (todo) {
            case MODE_DEFAULT:
                convert(args[0], args[1]);
                break;
            case MODE_W06:
                convert0906(args[0], args[1]);
                break;
            case MODE_UTF8:
                if (args.length < 3) {
                    Parser.out.println("-utf8 needs the input charset as third argument");
                    return;
                }
                convert8(args[0], args[1], args[2]);
                break;
            case MODE_W0809:
                convert0809(args[0], args[1]);
                break;
            case MODE_YUE:
                convertChnYue(args[0], args[1], wordsOnly);
                break;
            default:
                break;
        }
    }

    /**
     * Copies {@code infile} to {@code outfile}, decoding with the character
     * set {@code format} and encoding as UTF-8. Characters for which
     * {@link Character#isDefined(int)} is false are dropped. Any exception is
     * printed and swallowed, so a failure does not propagate to the caller.
     *
     * @param infile  path of the file to read
     * @param outfile path of the file to write
     * @param format  name of the character set of {@code infile}
     */
    private static void convert8(String infile, String outfile, String format) {
        try {
            Parser.out.println("availableCharsets: " + Charset.availableCharsets());

            int count = 0;
            int wcount = 0;
            // try-with-resources closes both streams even when the copy fails.
            try (BufferedReader in = new BufferedReader(
                         new InputStreamReader(new FileInputStream(infile), format));
                 BufferedWriter out = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(outfile), "UTF8"))) {
                int ch;
                while ((ch = in.read()) > -1) {
                    count++;
                    if (Character.isDefined(ch)) {
                        out.write(ch);
                        wcount++;
                    }
                }
            }
            // Reported only after both streams were closed successfully.
            Parser.out.println("read " + count + " chars and wrote " + wcount + " utf8 chars");
        } catch (Exception e) {
            // Best effort by design: report the problem and return.
            e.printStackTrace();
        }
    }

    /**
     * Prints {@code message} followed by the sentence number when
     * {@code value} is the empty string or a single space.
     *
     * @param value    field value to check; must not be {@code null}
     * @param message  text printed in front of the sentence number
     * @param sentence running number of the current sentence
     */
    private static void reportIfBlank(String value, String message, int sentence) {
        if (value.length() == 0 || value.equals(" ")) {
            Parser.out.println(message + sentence);
        }
    }

    /**
     * Limits a head index to the number of tokens of the sentence and reports
     * every index that had to be reduced.
     *
     * @param head     head index to check
     * @param tokens   number of tokens in the sentence (root excluded)
     * @param sentence running number of the current sentence
     * @return {@code head}, or {@code tokens} if {@code head} was larger
     */
    private static int clampHead(int head, int tokens, int sentence) {
        if (head > tokens) {
            Parser.out.println("head out of range " + head + " " + tokens + " " + sentence);
            return tokens;
        }
        return head;
    }

    /**
     * Reads sentences with {@code CONLLReader06} and writes them with
     * {@code CONLLWriter09}. Empty forms are replaced by a single space;
     * blank tags, labels and lemmas are only reported. Heads larger than the
     * token count are clamped to the token count.
     *
     * @param source path of the input file
     * @param target path of the output file
     * @throws Exception if reading or writing fails
     */
    public static void convert(String source, String target) throws Exception {
        CONLLReader06 reader = new CONLLReader06(source);
        CONLLWriter09 writer = new CONLLWriter09(target);

        int str = 0;
        while (true) {
            SentenceData09 i = reader.getNext();
            str++;
            if (i == null) {
                break;
            }

            int n = i.length() - 1;
            String[] formsNoRoot = new String[n];
            String[] of = new String[n];
            String[] pf = new String[n];
            String[] pposs = new String[n];
            String[] labels = new String[n];
            String[] fillp = new String[n]; // never filled, stays all null
            int[] heads = new int[n];

            for (int j = 0; j < n; j++) {
                formsNoRoot[j] = i.forms[j + 1];
                if (formsNoRoot[j].isEmpty()) {
                    Parser.out.println("error forms " + str);
                    formsNoRoot[j] = " ";
                }

                // gpos is validated but not written; ppos is used for both tag columns.
                reportIfBlank(i.gpos[j + 1], "error pos ", str);

                pposs[j] = i.ppos[j + 1];
                reportIfBlank(pposs[j], "error pos ", str);

                labels[j] = i.labels[j + 1];
                reportIfBlank(labels[j], "error lab ", str);

                heads[j] = clampHead(i.heads[j + 1], n, str);

                // Both lemma fields are validated only; the message text
                // ("error lab") is kept exactly as it has always been printed.
                reportIfBlank(i.plemmas[j + 1], "error lab ", str);
                reportIfBlank(i.lemmas[j + 1], "error lab ", str);

                of[j] = i.ofeats[j + 1];
                pf[j] = i.pfeats[j + 1];
            }

            // The forms are passed three times: as forms and in the two
            // argument positions that follow them.
            SentenceData09 i09 = new SentenceData09(formsNoRoot, formsNoRoot, formsNoRoot,
                    pposs, pposs, labels, heads, fillp, of, pf);
            writer.write(i09);
        }
        writer.finishWriting();
    }

    /**
     * Converts a UTF-8, tab-separated file into the format written by
     * {@code CONLLWriter09}. Each input line holds one token with the columns
     * form, tag, head and label (indices 0 to 3); the head value is
     * incremented by one. A line shorter than two characters ends a sentence.
     * A final sentence that is not followed by such a line is converted too.
     *
     * @param source    path of the input file
     * @param target    path of the output file
     * @param wordsOnly if true, tag and label are written as {@code "_"} and
     *                  every head as 0
     * @throws Exception if reading, parsing or writing fails
     */
    public static void convertChnYue(String source, String target, boolean wordsOnly) throws Exception {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(source), "UTF-8"), 32768)) {
            CONLLWriter09 writer = new CONLLWriter09(target);

            int str = 0;
            boolean endOfFile = false;
            while (!endOfFile) {
                ArrayList<String[]> lines = new ArrayList<>();
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.length() < 2) {
                        break;
                    }
                    lines.add(line.split("\t"));
                }
                endOfFile = (line == null);
                // At end of input, tokens collected so far still form a
                // sentence; stop only when nothing is pending.
                if (endOfFile && lines.isEmpty()) {
                    break;
                }
                str++;

                int n = lines.size();
                String[] formsNoRoot = new String[n];
                String[] posNoRoot = new String[n];
                String[] lemmas = new String[n];
                String[] org_lemmas = new String[n];
                String[] of = new String[n];
                String[] pf = new String[n];
                String[] labels = new String[n];
                String[] fillp = new String[n]; // never filled, stays all null
                int[] heads = new int[n];

                for (int j = 0; j < n; j++) {
                    String[] columns = lines.get(j);

                    formsNoRoot[j] = columns[0];
                    if (formsNoRoot[j].isEmpty()) {
                        Parser.out.println("error forms " + str);
                        formsNoRoot[j] = "_";
                    }

                    posNoRoot[j] = columns[1];
                    reportIfBlank(posNoRoot[j], "error pos ", str);

                    labels[j] = columns[3];
                    if (labels[j].length() == 0 || labels[j].equals(" ")) {
                        Parser.out.println("error lab " + str);
                        labels[j] = "_";
                    }

                    heads[j] = clampHead(Integer.parseInt(columns[2]) + 1, n, str);
                    // 0 is root and not -1. NOTE(review): after the +1 shift a
                    // file value of -1 is already 0, so this only triggers for
                    // a file value of -2 -- confirm whether that is intended.
                    if (heads[j] == -1) {
                        heads[j] = 0;
                    }

                    lemmas[j] = "_";
                    org_lemmas[j] = "_";
                    of[j] = "_";
                    pf[j] = "_";

                    if (wordsOnly) {
                        posNoRoot[j] = "_";
                        heads[j] = 0;
                        labels[j] = "_";
                    }
                }

                SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas,
                        posNoRoot, posNoRoot, labels, heads, fillp, of, pf);
                writer.write(i09);
            }
            writer.finishWriting();
        }
    }

    /**
     * Reads sentences with {@code CONLLReader08} and writes them with
     * {@code CONLLWriter09}. Empty forms are replaced by a single space;
     * blank tags, labels and predicted lemmas are only reported. Heads larger
     * than the token count are clamped. The feature arrays handed to the
     * writer are left unfilled.
     *
     * @param source path of the input file
     * @param target path of the output file
     * @throws Exception if reading or writing fails
     */
    public static void convert0809(String source, String target) throws Exception {
        CONLLReader08 reader = new CONLLReader08(source);
        CONLLWriter09 writer = new CONLLWriter09(target);

        int str = 0;
        while (true) {
            SentenceData09 i = reader.getNext();
            str++;
            if (i == null) {
                break;
            }

            int n = i.length() - 1;
            String[] formsNoRoot = new String[n];
            String[] lemmas = new String[n];
            String[] org_lemmas = new String[n];
            String[] of = new String[n];    // never filled, stays all null
            String[] pf = new String[n];    // never filled, stays all null
            String[] pposs = new String[n];
            String[] labels = new String[n];
            String[] fillp = new String[n]; // never filled, stays all null
            int[] heads = new int[n];

            for (int j = 0; j < n; j++) {
                formsNoRoot[j] = i.forms[j + 1];
                if (formsNoRoot[j].isEmpty()) {
                    Parser.out.println("error forms " + str);
                    formsNoRoot[j] = " ";
                }

                // gpos is validated but not written; ppos is used for both tag columns.
                reportIfBlank(i.gpos[j + 1], "error pos ", str);

                pposs[j] = i.ppos[j + 1];
                reportIfBlank(pposs[j], "error pos ", str);

                labels[j] = i.labels[j + 1];
                reportIfBlank(labels[j], "error lab ", str);

                heads[j] = clampHead(i.heads[j + 1], n, str);

                lemmas[j] = i.plemmas[j + 1];
                // Message text ("error lab") kept exactly as it has always been printed.
                reportIfBlank(lemmas[j], "error lab ", str);

                org_lemmas[j] = i.lemmas[j + 1];
            }

            // i.lemmas goes into the second argument, i.plemmas into the third.
            SentenceData09 i09 = new SentenceData09(formsNoRoot, org_lemmas, lemmas,
                    pposs, pposs, labels, heads, fillp, of, pf);
            writer.write(i09);
        }
        writer.finishWriting();
    }

    /**
     * Reads sentences with {@code CONLLReader09} and writes them with
     * {@code CONLLWriter06}. Fields are copied without validation.
     *
     * @param source path of the input file
     * @param target path of the output file
     * @throws Exception if reading or writing fails
     */
    public static void convert0906(String source, String target) throws Exception {
        CONLLReader09 reader = new CONLLReader09(source);
        CONLLWriter06 writer = new CONLLWriter06(target);

        while (true) {
            SentenceData09 i = reader.getNext();
            if (i == null) {
                break;
            }

            int n = i.length() - 1;
            String[] formsNoRoot = new String[n];
            String[] posNoRoot = new String[n];
            String[] lemmas = new String[n];
            String[] org_lemmas = new String[n];
            String[] of = new String[n];
            String[] pf = new String[n];
            String[] pposs = new String[n];
            String[] labels = new String[n];
            String[] fillp = new String[n]; // never filled, stays all null
            int[] heads = new int[n];

            for (int j = 0; j < n; j++) {
                formsNoRoot[j] = i.forms[j + 1];
                posNoRoot[j] = i.gpos[j + 1];
                // NOTE(review): both tag arrays are filled from gpos -- confirm
                // that ppos is deliberately not used here.
                pposs[j] = i.gpos[j + 1];
                labels[j] = i.labels[j + 1];
                heads[j] = i.heads[j + 1];
                lemmas[j] = i.plemmas[j + 1];
                org_lemmas[j] = i.lemmas[j + 1];
                of[j] = i.ofeats[j + 1];
                pf[j] = i.pfeats[j + 1];
            }

            SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas,
                    posNoRoot, pposs, labels, heads, fillp, of, pf);
            writer.write(i09);
        }
        writer.finishWriting();
    }
}