package is2.util; import java.io.*; import java.util.StringTokenizer; /** * @author Dr. Bernd Bohnet, 17.01.2010 * * This class removes all information from a conll 2009 file except of columns 1 * and 2 that contain the word id and the word form. */ public class ConvertTiger2CoNLL { public static void main(String[] args) throws IOException { OptionsSuper options = new OptionsSuper(args, null); if (options.trainfile != null) { System.err.println("included sentences " + clean(options.trainfile, options.outfile, options.start, options.count)); } else { System.err.println("Please proivde the file name -train <file-name>"); } } /** * @param trainfile * @throws IOException */ private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException { System.err.println("writting to " + outFile); System.err.println("start " + start + " to " + (start + numberOfSentences)); int state = 0; BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 32768); BufferedWriter writer = new BufferedWriter(new java.io.OutputStreamWriter(new java.io.FileOutputStream(outFile), "UTF-8"), 32768); String l; try { int id = 1, snt = 0, cnt = 0; while ((l = reader.readLine()) != null) { if (l.startsWith("#BOS")) { state = 1; //BOS id = 1; snt++; continue; } if (l.startsWith("#EOS") && state == 1) { state = 2; //BOS cnt++; writer.newLine(); } if (start > snt || (start + numberOfSentences) <= snt) { state = 3; } if (l.startsWith("#5") || l.startsWith("#6") || l.startsWith("#7")) { continue; } if ((start + numberOfSentences) <= snt) { break; } if (state == 3) { continue; } if (state == 1) { l = l.replace("\t\t", "\t"); l = l.replace("\t\t", "\t"); StringTokenizer t = new StringTokenizer(l, "\t"); int count = 0; writer.write("" + id + "\t"); while (t.hasMoreTokens()) { if (count == 0) { writer.write(t.nextToken() + "\t"); } else if (count == 1) { writer.write(t.nextToken() + "\t_\t"); } else if (count == 2) { writer.write(t.nextToken() + "\t_\t"); } else if (count == 3) { writer.write(t.nextToken().replace(".", "|") + "\t_\t"); } else { t.nextToken(); } count++; } writer.write("_\t_\t_\t_\t_\t_\t_\t_\t_"); writer.newLine(); } id++; } writer.flush(); writer.close(); reader.close(); return cnt; } catch (IOException e) { e.printStackTrace(); } return -1; } }