package edu.stanford.nlp.trees;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;
/**
* This utility looks for a given sentence in a file or directory of
* tree files. Options that can be specified are a tag separator used
* on the sentence, the encoding of the file, and a regex to limit the
* files looked for in subdirectorys. For example, if you specify
* -fileRegex ".*parse", then only filenames that end in "parse" will
* be considered.
* <br>
* The first non-option argument given will be the sentence searched
* for. The other arguments are paths in which to look for the
* sentence.
*
* @author John Bauer
*/
public class FindTreebankTree {
public static void main(String[] args) {
// Args specified with -tagSeparator, -encoding, etc are assigned
// to the appropriate option. Otherwise, the first arg found is
// the sentence to look for, and all other args are paths in which
// to look for that sentence.
String needle = "";
String tagSeparator = "_";
String encoding = "utf-8";
String fileRegex = "";
List<String> paths = new ArrayList<>();
for (int i = 0; i < args.length; ++i) {
if ((args[i].equalsIgnoreCase("-tagSeparator") ||
args[i].equalsIgnoreCase("--tagSeparator")) &&
i + 1 < args.length) {
tagSeparator = args[i + 1];
++i;
} else if ((args[i].equalsIgnoreCase("-encoding") ||
args[i].equalsIgnoreCase("--encoding")) &&
i + 1 < args.length) {
encoding = args[i + 1];
++i;
} else if ((args[i].equalsIgnoreCase("-fileRegex") ||
args[i].equalsIgnoreCase("--fileRegex")) &&
i + 1 < args.length) {
fileRegex = args[i + 1];
++i;
} else if (needle.equals("")) {
needle = args[i].trim();
} else {
paths.add(args[i]);
}
}
TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
// If the user specified a regex, here we make a filter using that
// regex. We just use an anonymous class for the filter
FileFilter filter = null;
if (!fileRegex.equals("")) {
final Pattern filePattern = Pattern.compile(fileRegex);
filter = pathname -> (pathname.isDirectory() ||
filePattern.matcher(pathname.getName()).matches());
}
for (String path : paths) {
// Start a new treebank with the given path, encoding, filter, etc
DiskTreebank treebank = new DiskTreebank(trf, encoding);
treebank.loadPath(path, filter);
Iterator<Tree> treeIterator = treebank.iterator();
int treeCount = 0;
String currentFile = "";
while (treeIterator.hasNext()) {
// the treebank might be a directory, not a single file, so
// keep track of which file we are currently looking at
if (!currentFile.equals(treebank.getCurrentFilename())) {
currentFile = treebank.getCurrentFilename();
treeCount = 0;
}
++treeCount;
Tree tree = treeIterator.next();
List<TaggedWord> sentence = tree.taggedYield();
boolean found = false;
// The tree can match in one of three ways: tagged, untagged,
// or untagged and unsegmented (which is useful for Chinese,
// for example)
String haystack = SentenceUtils.listToString(sentence, true);
found = needle.equals(haystack);
haystack = haystack.replaceAll(" ", "");
found = found || needle.equals(haystack);
haystack = SentenceUtils.listToString(sentence, false, tagSeparator);
found = found || needle.equals(haystack);
if (found) {
System.out.println("needle found in " + currentFile +
" tree " + treeCount);
}
}
}
}
}