package edu.stanford.nlp.international.arabic.pipeline;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeVisitor;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
* Converts VP < PP-CLR construction to MWV < MWP
*
* @author Spence Green
*
*/
public class MWETreeVisitor implements TreeVisitor {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(MWETreeVisitor.class);
private static final boolean DEBUG = false;
private final List<Pair<TregexPattern,TsurgeonPattern>> ops;
public MWETreeVisitor() {
ops = loadOps();
}
private List<Pair<TregexPattern, TsurgeonPattern>> loadOps() {
List<Pair<TregexPattern,TsurgeonPattern>> ops = new ArrayList<>();
String line = null;
try {
BufferedReader br = new BufferedReader(new StringReader(editStr));
List<TsurgeonPattern> tsp = new ArrayList<>();
while ((line = br.readLine()) != null) {
if (DEBUG) log.info("Pattern is " + line);
TregexPattern matchPattern = TregexPattern.compile(line);
if (DEBUG) log.info(" [" + matchPattern + "]");
tsp.clear();
while (continuing(line = br.readLine())) {
TsurgeonPattern p = Tsurgeon.parseOperation(line);
if (DEBUG) log.info("Operation is " + line + " [" + p + "]");
tsp.add(p);
}
if ( ! tsp.isEmpty()) {
TsurgeonPattern tp = Tsurgeon.collectOperations(tsp);
ops.add(new Pair<>(matchPattern, tp));
}
} // while not at end of file
} catch (IOException ioe) {
ioe.printStackTrace();
}
return ops;
}
private static boolean continuing(String str) {
return str != null && ! str.matches("\\s*");
}
public void visitTree(Tree t) {
Tsurgeon.processPatternsOnTree(ops, t);
}
/**
* The Tsurgeon patterns
*/
private static final String editStr =
//Mark MWEs
("@VP=vp < /PP-CLR/=pp\n"
+ "relabel vp MWV\n"
+ "relabel pp MWP\n"
+ "\n");
}