/** * Changes the order of a corpus to right-to-left. This is needed to build * right-to-left shift-reduce models. */ package edu.stanford.nlp.parser.ensemble.utils; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; public class ReverseCorpus { static final int INDEX_POS = 0; static final int HEAD_POS = 6; public static void main(String[] args) throws Exception { reverseCorpus(args[0], args[0] + ".reversed"); } public static void reverseCorpus(String in, String out) throws IOException { InputStream fis = new FileInputStream(in); if (in.endsWith(".gz")) { fis = new GZIPInputStream(fis); } BufferedReader is = new BufferedReader(new InputStreamReader(fis)); PrintStream os = new PrintStream(new FileOutputStream(out)); List<String[]> sent = new ArrayList<String[]>(); String line; while ((line = is.readLine()) != null) { if (line.trim().length() == 0) { List<String[]> rev = reverseSentence(sent); for (String[] toks : rev) { for (int i = 0; i < toks.length; i++) { if (i > 0) { os.print("\t"); } os.print(toks[i]); } os.println(); } os.println(); sent = new ArrayList<String[]>(); } else { String[] toks = line.split("[\t]+"); sent.add(toks); } } // some files do not end with an empty line... if (sent.size() > 0) { List<String[]> rev = reverseSentence(sent); for (String[] toks : rev) { for (int i = 0; i < toks.length; i++) { if (i > 0) { os.print("\t"); } os.print(toks[i]); } os.println(); } } is.close(); os.close(); } static List<String[]> reverseSentence(List<String[]> sent) { List<String[]> rev = new ArrayList<String[]>(); int len = sent.size(); for (int i = sent.size() - 1; i >= 0; i--) { String[] toks = sent.get(i); toks[INDEX_POS] = Integer.toString(len + 1 - Integer.parseInt(toks[INDEX_POS])); if (toks.length > HEAD_POS && !toks[HEAD_POS].equals("_") && !toks[HEAD_POS].equals("-")) { int oldHead = Integer.parseInt(toks[HEAD_POS]); if (oldHead != 0) { toks[HEAD_POS] = Integer.toString(len + 1 - oldHead); } } rev.add(toks); } return rev; } }