package edu.stanford.nlp.international.french.pipeline; import edu.stanford.nlp.util.logging.Redwood; import java.io.*; import java.util.ArrayList; import java.util.List; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeReader; import edu.stanford.nlp.trees.TreeReaderFactory; import edu.stanford.nlp.trees.TreeTransformer; import edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory; import edu.stanford.nlp.trees.tregex.TregexParseException; import edu.stanford.nlp.trees.tregex.TregexMatcher; import edu.stanford.nlp.trees.tregex.TregexPattern; import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon; import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern; import edu.stanford.nlp.util.Pair; /** * Makes FTB trees consistent with FrenchTreebankLanguagePack. Specifically, it removes * sentence-initial punctuation, and constraints sentence-final punctuation to be one of * [.!?]. * <p> * Also discards two trees of the form (SENT .), which appear in the Candito training * set. * * @author Spence Green * */ public class FTBCorrector implements TreeTransformer { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(FTBCorrector.class); private static final boolean DEBUG = false; private final List<Pair<TregexPattern,TsurgeonPattern>> ops; public FTBCorrector() { ops = loadOps(); } private List<Pair<TregexPattern, TsurgeonPattern>> loadOps() { List<Pair<TregexPattern,TsurgeonPattern>> ops = new ArrayList<>(); String line = null; try { BufferedReader br = new BufferedReader(new StringReader(editStr)); List<TsurgeonPattern> tsp = new ArrayList<>(); while ((line = br.readLine()) != null) { if (DEBUG) log.info("Pattern is " + line); TregexPattern matchPattern = TregexPattern.compile(line); if (DEBUG) log.info(" [" + matchPattern + "]"); tsp.clear(); while (continuing(line = br.readLine())) { TsurgeonPattern p = Tsurgeon.parseOperation(line); if (DEBUG) log.info("Operation is " + line + " [" + p + "]"); tsp.add(p); } if ( ! tsp.isEmpty()) { TsurgeonPattern tp = Tsurgeon.collectOperations(tsp); ops.add(new Pair<>(matchPattern, tp)); } } // while not at end of file } catch (IOException ioe) { ioe.printStackTrace(); } return ops; } private static boolean continuing(String str) { return str != null && ! str.matches("\\s*"); } public Tree transformTree(Tree t) { return Tsurgeon.processPatternsOnTree(ops, t); } /** * The Tsurgeon patterns */ private static final String editStr = //Delete sentence-initial punctuation ("@PUNC=punc <: __ >, @SENT\n" + "delete punc\n" + "\n") + //Delete sentence final punctuation that is preceded by punctuation (first time) ("@PUNC=punc <: __ >>- @SENT $, @PUNC\n" + "delete punc\n" + "\n") + //Delete sentence final punctuation that is preceded by punctuation (second time) ("@PUNC=punc <: __ >>- @SENT $, @PUNC\n" + "delete punc\n" + "\n") + //Convert remaining sentence-final punctuation to either . if it is not [.!?] ("@PUNC <: /^[^!\\.\\?]$/=term >>- @SENT !$, @PUNC\n" + "relabel term /./\n" + "\n") + //Delete medial, sentence-final punctuation ("@PUNC=punc <: (/^[!\\.\\?]$/ . __)\n" + "delete punc\n" + "\n") + //Now move the sentence-final mark under SENT ("@PUNC=punc <: /^[\\.!\\?]$/ >>- (@SENT <- __=sfpos) !> @SENT\n" + "move punc $- sfpos\n" + "\n") + //For those trees that lack a sentence-final punc, add one. ("!@PUNC <: /^[^\\.!\\?]$/ >>- (@SENT <- __=loc)\n" + "insert (PUNC .) $- loc\n" + "\n") + //Finally, delete these punctuation marks, which I can't seem to kill otherwise... ("@PUNC <: /^[\\.!\\?]+$/=punc . (@PUNC <: /[\\.!\\?]/)\n" + "prune punc\n" + "\n") + //A bad MWADV tree in the training set ("@NP=bad > @MWADV\n" + "excise bad bad\n" + "\n") + // Not sure why this got a label of X. Similar trees suggest it // should be A instead ("X=bad < demi\n" + "relabel bad A\n" + "\n") + // This also seems to be mislabeled ("PC=pc < D'|depuis|après\n" + "relabel pc P\n" + "\n"); /** * @param args */ public static void main(String[] args) { if(args.length != 1) { log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n"); System.exit(-1); } TreeTransformer tt = new FTBCorrector(); File f = new File(args[0]); try { //These bad trees in the Candito training set should be thrown out: // (ROOT (SENT (" ") (. .))) // (ROOT (SENT (. .))) TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC"); TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __"); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); TreeReaderFactory trf = new FrenchTreeReaderFactory(); TreeReader tr = trf.newTreeReader(br); int nTrees = 0; for(Tree t; (t = tr.readTree()) != null;nTrees++) { TregexMatcher m = pBadTree.matcher(t); TregexMatcher m2 = pBadTree2.matcher(t); if(m.find() || m2.find()) { log.info("Discarding tree: " + t.toString()); } else { Tree fixedT = tt.transformTree(t); System.out.println(fixedT.toString()); } } tr.close(); System.err.printf("Wrote %d trees%n",nTrees); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (TregexParseException e) { e.printStackTrace(); } } }