/** * */ package edu.berkeley.nlp.conll; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import edu.berkeley.nlp.PCFGLA.Option; import edu.berkeley.nlp.PCFGLA.OptionParser; import edu.berkeley.nlp.PCFGLA.GrammarTrainer.Options; import edu.berkeley.nlp.syntax.Tree; /** * @author petrov * */ public class DepToPTB { public static class Options { @Option(name = "-in", required = true, usage = "Input File for Grammar (Required)") public String inFileName; @Option(name = "-finePOStags", usage = "Use fine POS tags (Default: false=coarse") public boolean useFinePOS = false; } public static void main(String[] args) { // String[] sentence = { // "1 The _ DT DT _ 4 NMOD _ _\n", // "2 luxury _ NN NN _ 4 NMOD _ _\n", // "3 auto _ NN NN _ 4 NMOD _ _\n", // "4 maker _ NN NN _ 7 SBJ _ _\n", // "5 last _ JJ JJ _ 6 NMOD _ _\n", // "6 year _ NN NN _ 7 VMOD _ _\n", // "7 sold _ VB VBD _ 0 ROOT _ _\n", // "8 1,214 _ CD CD _ 9 NMOD _ _\n", // "9 cars _ NN NNS _ 7 OBJ _ _\n", // "10 in _ IN IN _ 7 ADV _ _\n", // "11 the _ DT DT _ 12 NMOD _ _\n", // "12 U.S. _ NN NNP _ 10 PMOD _ _\n"}; OptionParser optParser = new OptionParser(Options.class); Options opts = (Options) optParser.parse(args, true); // provide feedback on command-line arguments // System.out.println("Calling with " + optParser.getPassedInOptions()); BufferedReader input = null; String fileName = opts.inFileName; try { input = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), Charset.forName("UTF-8")));//GB18030"))); String line = ""; List<String> sentence = new ArrayList<String>(); while ((line = input.readLine()) != null) { System.out.println(line); if (line.equals("")){ Tree<String> tree = turnIntoTree(sentence, opts.useFinePOS); System.out.println("( "+tree+")"); sentence = new LinkedList<String>(); } else sentence.add(line); } } catch (Exception ex) { ex.printStackTrace(); } } /** * @param sentence * @return */ private static Tree<String> turnIntoTree(List<String> sentence, boolean useFinePOS) { int posIndex = (useFinePOS) ? 4 : 3; int nWords = sentence.size(); Tree[] trees = new Tree[nWords]; List<Integer>[] childIndices = new List[nWords]; int[] freeKids = new int[nWords]; int[] parentIndices = new int[nWords]; int rootIndex = -1; for (int i=0; i<nWords; i++){ childIndices[i] = new LinkedList<Integer>(); } for (int i=0; i<nWords; i++){ String[] fields = sentence.get(i).split("\t"); String word = fields[1]; if (word.equals("(")||word.equals(")")) word = "LRB"; Tree<String> child = new Tree<String>(word); List<Tree<String>> childList = new ArrayList<Tree<String>>(1); childList.add(child); String tag = fields[posIndex]; if (tag.equals("(")||tag.equals(")")) tag = "LRB"; trees[i] = new Tree<String>(tag, childList); int pIndex = Integer.parseInt(fields[6])-1; parentIndices[i] = pIndex; if (pIndex==-1) rootIndex=i; else childIndices[pIndex].add(i); childIndices[i].add(i); } if (nWords == 1) return trees[0]; for (int i=0; i<nWords; i++){ for (Integer c : childIndices[i]){ if (childIndices[c].size()==1) freeKids[i]++; } freeKids[i]++; // because each tree is also its own child } while (childIndices[rootIndex].size()>0){ for (int i=0; i<nWords; i++){ if (childIndices[i].size()<=1) continue; if (freeKids[i]==childIndices[i].size()){ // all its children are free -> attach them List<Tree<String>> childList = new ArrayList<Tree<String>>(); for (Integer c : childIndices[i]){ childList.add(trees[c]); } Tree<String> newTree = new Tree<String>(trees[i].getLabel()+"*",childList); trees[i] = newTree; if (parentIndices[i]>=0) freeKids[parentIndices[i]]++; childIndices[i] = new LinkedList(); } } } return trees[rootIndex]; } }