package edu.stanford.nlp.parser.ensemble.utils; import edu.stanford.nlp.process.Morphology; import java.io.BufferedReader; import java.io.FileOutputStream; import java.io.FileReader; import java.io.PrintStream; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; public class CleanUpStanford { public static final int WORD = 1; public static final int LEMMA = 2; public static final int POS = 3; public static final int LABEL = 7; public static final int HEAD = 6; public static void main(String[] args) throws Exception { Morphology morpha = new Morphology(); BufferedReader is = new BufferedReader(new FileReader(args[0])); PrintStream os = new PrintStream(new FileOutputStream(args[1])); String line; ArrayList<String[]> sentence = new ArrayList<String[]>(); while ((line = is.readLine()) != null) { if (line.trim().length() == 0) { analyze(sentence); print(os, sentence); os.println(); sentence = new ArrayList<String[]>(); continue; } String[] toks = line.split("[\t]+"); //System.out.println(line); // convert all dep labels to upper case toks[LABEL] = toks[LABEL].toUpperCase(); // convert NULL to ROOT if (toks[LABEL].equalsIgnoreCase("NULL")) { toks[LABEL] = "ROOT"; } // generate lemmas toks[LEMMA] = morpha.stem(toks[WORD], toks[POS]).word(); sentence.add(toks); /* * for(int i = 0; i < toks.length; i ++){ if(i > 0) os.print("\t"); * os.print(toks[i]); } os.println(); */ } is.close(); os.close(); } private static void analyze(ArrayList<String[]> sent) { // check if there is a ROOT int foundRoot = 0; for (int i = 0; i < sent.size(); i++) { String[] toks = sent.get(i); if (Integer.parseInt(toks[HEAD]) == 0 && toks[LABEL].equals("ROOT")) { foundRoot++; } } if (foundRoot == 0) { System.err.println("Found sentence without ROOT!"); print(System.err, sent); } else if (foundRoot > 1) { System.err.println("Found sentence with multiple ROOTs!"); print(System.err, sent); } // check if there are cycles boolean foundCycle = false; for (int i = 0; i < sent.size(); i++) { if (hasCycles(sent, i)) { foundCycle = true; break; } } if (foundCycle) { System.err.println("Found sentence with cycles!"); print(System.err, sent); } } private static boolean hasCycles(ArrayList<String[]> sent, int start) { Set<Integer> seen = new HashSet<Integer>(); List<Integer> seq = new ArrayList<Integer>(); for (int crt = start; crt >= 0; crt = Integer.parseInt(sent.get(crt)[HEAD]) - 1) { seq.add(crt); if (seen.contains(crt)) { System.err.print("CYCLE:"); for (Integer i : seq) { System.err.print(" " + i); } System.err.println(); return true; } seen.add(crt); } return false; } private static void print(PrintStream os, ArrayList<String[]> sent) { for (int j = 0; j < sent.size(); j++) { String[] toks = sent.get(j); for (int i = 0; i < toks.length; i++) { if (i > 0) { os.print("\t"); } os.print(toks[i]); } os.println(); } } }