package edu.stanford.nlp.international.arabic.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.TreeVisitor;
import edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory;
import edu.stanford.nlp.util.Generics;
/**
* Converts all contiguous MWEs listed in an MWE list to flattened trees.
*
* @author Spence Green
*
*/
public class MWETreeVisitorExternal implements TreeVisitor {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(MWETreeVisitorExternal.class);
private static final String mweFile = "/home/rayder441/sandbox/javanlp/projects/core/data/edu/stanford/nlp/pipeline/attia-mwe-list.txt.out.tok.fixed.proc.uniq";
private final Set<String> mweDictionary;
public MWETreeVisitorExternal() {
mweDictionary = loadMWEs();
}
private Set<String> loadMWEs() {
Set<String> mweSet = Generics.newHashSet();
try {
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(mweFile), "UTF-8"));
for (String line; (line = br.readLine()) != null;) {
mweSet.add(line.trim());
}
br.close();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return mweSet;
}
/**
* Perform (possibly destructive) operations on the tree. Do a top-down DFS on the tree.
*/
public void visitTree(Tree tree) {
if (tree == null) return;
String yield = SentenceUtils.listToString(tree.yield());
if (mweDictionary.contains(yield)) {
List<Tree> children = getPreterminalSubtrees(tree);
String newLabel = "MW" + tree.value();
tree.setValue(newLabel);
tree.setChildren(children);
// Bottom out of the recursion
return;
} else {
for (Tree subTree : tree.children()) {
if (subTree.isPhrasal()) {
// Only phrasal trees can have yields > 1!!
visitTree(subTree);
}
}
}
}
private List<Tree> getPreterminalSubtrees(Tree tree) {
List<Tree> preterminals = new ArrayList<>();
for (Tree subTree : tree) {
if (subTree.isPreTerminal()) {
preterminals.add(subTree);
}
}
return preterminals;
}
/**
* For debugging.
*
* @param args
*/
public static void main(String[] args) {
if (args.length != 1) {
System.err.printf("Usage: java %s atb_tree_file > atb_tree_file.out%n", MWETreeVisitorExternal.class.getName());
System.exit(-1);
}
TreeReaderFactory trf = new ArabicTreeReaderFactory();
try {
TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")));
TreeVisitor visitor = new MWETreeVisitorExternal();
int treeId = 0;
for (Tree tree; (tree = tr.readTree()) != null; ++treeId) {
if (tree.value().equals("ROOT")) {
// Skip over the ROOT tag
tree = tree.firstChild();
}
visitor.visitTree(tree);
System.out.println(tree.toString());
}
tr.close();
System.err.printf("Processed %d trees.%n", treeId);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}