package info.ephyra.nlp;
import info.ephyra.util.Properties;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import edu.cmu.lti.javelin.util.DeltaRangeMap;
import edu.cmu.lti.javelin.util.RangeMap;
import edu.stanford.nlp.ling.MapLabel;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
/**
* Wrapper for the Stanford parser.
*
* @author Justin Betteridge, Nico Schlaefer
* @version 2007-10-30
*/
public class StanfordParser
{
protected static final Logger log = Logger.getLogger(StanfordParser.class);
protected static final Pattern whitespace_pattern = Pattern.compile("\\s+");
protected static final Pattern escaped_char_pattern = Pattern.compile("\\\\/");
protected static final Pattern double_quote_lable_pattern = Pattern.compile("[`'][`']");
protected static final Pattern bracket_label_pattern = Pattern.compile("-...-");
public static final String BEGIN_KEY = "begin";
public static final String END_KEY = "end";
protected static class MutableInteger {
public int value;
public MutableInteger() { value = 0; }
public MutableInteger(int i) { value = i; }
public String toString() { return Integer.toString(value); }
public int getValue() { return value; }
public void setValue(int i) { value = i; }
}
protected static TreebankLanguagePack tlp = null;
protected static LexicalizedParser parser = null;
/**
* Hide default ctor.
*/
protected StanfordParser() {}
/**
* Initializes static resources.
*
* @throws Exception
*/
public static void initialize() throws Exception
{
if (parser != null) return;
Properties properties = Properties.loadFromClassName(StanfordParser.class.getName());
tlp = new PennTreebankLanguagePack();
String modelFile = properties.getProperty("modelFile");
if (modelFile == null)
throw new Exception("Required property '"
+ "modelFile' is undefined");
parser = new LexicalizedParser(modelFile);
}
/**
* Unloads static resources.
*
* @throws Exception
*/
public static void destroy() throws Exception
{
tlp = null;
parser = null;
}
/**
* Parses a sentence and returns a string representation of the parse tree.
*
* @param sentence a sentence
* @return Tree whose Label is a MapLabel containing correct begin and end
* character offsets in keys BEGIN_KEY and END_KEY
*/
@SuppressWarnings("unchecked")
public static String parse(String sentence)
{
if (tlp == null || parser == null)
throw new RuntimeException("Parser has not been initialized");
// parse the sentence to produce stanford Tree
log.debug("Parsing sentence");
Tree tree = null;
synchronized (parser) {
Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
List<Word> words = tokenizer.tokenize();
log.debug("Tokenization: "+words);
parser.parse(new Sentence(words));
tree = parser.getBestParse();
}
// label tree with character extents
//log.debug("Setting character extents");
//updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1));
//log.debug("Creating offset mapping");
//List<RangeMap> mapping = createMapping(sentence);
//log.debug(mapping.toString());
//log.debug("Applying offset mapping");
//mapOffsets(tree, mapping);
return tree.toString().replaceAll(" \\[[\\S]+\\]","");
}
/**
* Parses a sentence and returns the PCFG score as a confidence measure.
*
* @param sentence a sentence
* @return PCFG score
*/
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
if (tlp == null || parser == null)
throw new RuntimeException("Parser has not been initialized");
// parse the sentence to produce PCFG score
log.debug("Parsing sentence");
double score;
synchronized (parser) {
Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
List<Word> words = tokenizer.tokenize();
log.debug("Tokenization: "+words);
parser.parse(new Sentence(words));
score = parser.getPCFGScore();
}
return score;
}
protected static void updateTreeLabels(Tree root, Tree tree, MutableInteger offset, MutableInteger leafIndex)
{
if (tree.isLeaf()) {
leafIndex.value++;
return;
}
String labelValue = tree.label().value().toUpperCase();
int begin = root.leftCharEdge(tree);
int end = root.rightCharEdge(tree);
//System.out.println(labelValue+"("+begin+","+end+")");
int length = end - begin;
// apply offset to begin extent
begin += offset.value;
// calculate offset delta based on label
if (double_quote_lable_pattern.matcher(labelValue).matches() && length > 1) {
offset.value--;
log.debug("Quotes label pattern fired: "+offset);
} else if (bracket_label_pattern.matcher(labelValue).matches()) {
offset.value -= 4;
log.debug("Bracket label pattern fired: "+offset);
} else if (tree.isPreTerminal()) {
Tree leaf = tree.firstChild();
String text = leaf.label().value();
Matcher matcher = escaped_char_pattern.matcher(text);
while (matcher.find()) {
offset.value--;
}
}
for (Tree child : tree.children())
updateTreeLabels(root, child, offset, leafIndex);
// apply offset to end extent
end += offset.value;
// set begin and end offsets on node
MapLabel label = new MapLabel(tree.label());
label.put(BEGIN_KEY, begin);
label.put(END_KEY, end);
label.put(MapLabel.INDEX_KEY, leafIndex.value);
tree.setLabel(label);
}
/**
* @param sentence
* @return a list of RangeMap objects which define a mapping of character
* offsets in a white-space depleted version of the input string back into
* offsets in the input string.
*/
protected static List<RangeMap> createMapping(String sentence)
{
List<RangeMap> mapping = new LinkedList<RangeMap>();
Matcher whitespace_matcher = whitespace_pattern.matcher(sentence);
DeltaRangeMap delta_rmap = null;
// find all sequences of whitespace chars
while (whitespace_matcher.find()) {
int start = whitespace_matcher.start();
int end = whitespace_matcher.end();
int length = end - start;
if (delta_rmap == null) {
// create a new RangeMap object whose start begins at current
// match start, and whose end is at the moment undefined. The
// delta here is taken to be the length of the whitespace
// sequence.
delta_rmap = new DeltaRangeMap(start, 0, length);
} else {
// we've found the next sequence of whitespace chars, so we
// finalize the end extent of the previous RangeMap, and make a
// new RangeMap to describe the mapping from this point forward.
delta_rmap.end = start - delta_rmap.delta;
mapping.add(delta_rmap);
delta_rmap = new DeltaRangeMap(delta_rmap.end, 0, delta_rmap.delta + length);
}
}
// process trailing DeltaRangeMap if it exists
if (delta_rmap != null) {
delta_rmap.end = sentence.length() - delta_rmap.delta;
mapping.add(delta_rmap);
}
return mapping;
}
/**
* Maps Tree node offsets using provided mapping.
* @param tree the Tree whose begin and end extents should be mapped.
* @param mapping the list of RangeMap objects which defines the mapping.
*/
protected static void mapOffsets(Tree tree, List<RangeMap> mapping)
{
// if mapping is empty, then assume 1-to-1 mapping.
if (mapping == null || mapping.size() == 0) return;
int begin_map_index = 0;
RangeMap begin_rmap = mapping.get(begin_map_index);
TREE: for (Tree t : tree) {
if (t.isLeaf()) continue;
MapLabel label = (MapLabel) t.label();
int begin = (Integer) label.get(BEGIN_KEY);
// "end" must be index of last char in range
int end = (Integer) label.get(END_KEY) - 1;
// find the first rangemap whose end is greater than the
// beginning of current annotation.
// log.debug("Finding RangeMap whose extents include
// annotation.begin");
while (begin_rmap.end <= begin) {
begin_map_index++;
if (begin_map_index >= mapping.size()) break TREE;
begin_rmap = mapping.get(begin_map_index);
}
// if beginning of current rangemap is greater than end of
// current annotation, then skip this annotation (default
// mapping is 1-to-1).
if (begin_rmap.begin > end) {
// log.debug("Skipping annotation (assuming 1-to-1 offset
// mapping)");
continue;
}
// if beginning of current annotation falls within current range
// map, then map it back to source space.
int new_begin = begin;
if (begin_rmap.begin <= new_begin) {
// log.debug("Applying RangeMap to begin offset");
new_begin = begin_rmap.map(new_begin);
}
// find the first rangemap whose end is greater than the end of
// current annotation.
// log.debug("Finding RangeMap whose extents include
// annotation.end");
int end_map_index = begin_map_index;
RangeMap end_rmap = begin_rmap;
END_OFFSET: while (end_rmap.end <= end) {
end_map_index++;
if (end_map_index >= mapping.size()) break END_OFFSET;
end_rmap = mapping.get(end_map_index);
}
// if end of current annotation falls within "end" range map,
// then map it back to source space.
int new_end = end;
if (end_rmap.begin <= end) {
// log.debug("Applying RangeMap to end offset");
new_end = end_rmap.map(end);
}
label.put(BEGIN_KEY, new_begin);
label.put(END_KEY, new_end + 1);
}
}
// private static void printOffsets(String sentence, Tree tree)
// {
// if (tree.isLeaf()) return;
// MapLabel label = (MapLabel) tree.label();
// int begin = (Integer) label.get(BEGIN_KEY);
// int end = (Integer) label.get(END_KEY);
// int index = (Integer) label.index();
// String str = null;
// if (begin < 0 || begin > sentence.length() || end < begin || end > sentence.length()) {
// str = "error";
// } else {
// str = sentence.substring(begin, end);
// }
// System.out.println(label.value()+"("+index+":"+begin+","+end+"): "+str);
// for (Tree child : tree.children())
// printOffsets(sentence, child);
// }
public static void main(String[] args) throws Exception
{
if (args.length != 1) {
System.out.println("USAGE: StanfordParser <inputSentencesFile>");
System.out.println("Output stored in: <inputSentencesFile>.parses");
System.exit(0);
}
StanfordParser.initialize();
List<String> sentences = new ArrayList<String> ();
BufferedReader in = new BufferedReader(new FileReader(args[0]));
BufferedWriter out = new BufferedWriter(new FileWriter(args[0]+".parses"));
String sentence;
while ((sentence = in.readLine()) != null) {
sentences.add(sentence);
}
for (String s : sentences) {
out.append(StanfordParser.parse(s)+"\n");
}
out.close();
in.close();
}
}