// StanfordParser.java example
//
// Explorer
// lucida-master
// - lucida
package info.ephyra.nlp;

import info.ephyra.util.Properties;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import edu.cmu.lti.javelin.util.DeltaRangeMap;
import edu.cmu.lti.javelin.util.RangeMap;
import edu.stanford.nlp.ling.MapLabel;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;

/**
 * Wrapper for the Stanford parser.
 * 
 * @author Justin Betteridge, Nico Schlaefer
 * @version 2007-10-30
 */
public class StanfordParser
{
    /** Logger used by the static methods of this class. */
    protected static final Logger log = Logger.getLogger(StanfordParser.class);
    /** Matches a run of one or more whitespace characters; used by createMapping. */
    protected static final Pattern whitespace_pattern = Pattern.compile("\\s+");
    /**
     * Matches a backslash immediately followed by a forward slash. In
     * updateTreeLabels every occurrence in a leaf's text lowers the running
     * offset by one.
     */
    protected static final Pattern escaped_char_pattern = Pattern.compile("\\\\/");
    /**
     * Matches a two-character label in which each character is a backtick or
     * an apostrophe. (The identifier's spelling "lable" is kept as-is because
     * the field is protected and may be referenced elsewhere.)
     */
    protected static final Pattern double_quote_lable_pattern = Pattern.compile("[`'][`']");
    /**
     * Matches a five-character label: a hyphen, any three characters, a hyphen.
     * NOTE(review): presumably targets bracket labels such as -LRB- / -RRB-;
     * confirm against the parser's tag set.
     */
    protected static final Pattern bracket_label_pattern = Pattern.compile("-...-");

    /** MapLabel key under which updateTreeLabels stores a node's begin offset. */
    public static final String BEGIN_KEY = "begin";
    /** MapLabel key under which updateTreeLabels stores a node's end offset. */
    public static final String END_KEY = "end";
    
    /**
     * Small mutable holder for an {@code int}, so that recursive calls can
     * share and update a single counter.
     */
    protected static class MutableInteger {
        /** The wrapped value; directly readable and writable. */
        public int value;

        /** Creates a holder whose value is zero. */
        public MutableInteger() {
            this(0);
        }

        /** Creates a holder with the given initial value. */
        public MutableInteger(int i) {
            value = i;
        }

        /** Returns the decimal representation of the current value. */
        @Override
        public String toString() {
            return String.valueOf(value);
        }

        /** Returns the current value. */
        public int getValue() {
            return value;
        }

        /** Replaces the current value. */
        public void setValue(int i) {
            value = i;
        }
    }

    /**
     * Language pack that supplies the tokenizer factory; {@code null} until
     * initialize() has run and again after destroy().
     */
    protected static TreebankLanguagePack tlp = null;
    /**
     * Shared parser instance; {@code null} until initialize() has run and
     * again after destroy(). parse() and getPCFGScore() synchronize on it.
     */
    protected static LexicalizedParser parser = null;

    /**
     * Hides the default constructor from general use: the functionality of
     * this class is exposed through static methods. The constructor is
     * protected rather than private, so subclasses can still invoke it.
     */
    protected StanfordParser() {}

    /**
     * Sets up the static language pack and parser. Does nothing when a parser
     * is already present.
     * <p>
     * The parser model location is taken from the {@code modelFile} property
     * of the properties loaded for this class name.
     *
     * @throws Exception if the {@code modelFile} property is undefined, or if
     * loading the properties or constructing the parser fails
     */
    public static void initialize() throws Exception
    {
        if (parser == null) {
            Properties config = Properties.loadFromClassName(StanfordParser.class.getName());
            tlp = new PennTreebankLanguagePack();

            String modelPath = config.getProperty("modelFile");
            if (modelPath == null) {
                throw new Exception("Required property 'modelFile' is undefined");
            }
            parser = new LexicalizedParser(modelPath);
        }
    }

    /**
     * Releases the static parser and language pack by clearing both
     * references. After this call parse() and getPCFGScore() report that the
     * parser is uninitialized until initialize() is invoked again.
     *
     * @throws Exception declared for interface compatibility; this
     * implementation does not throw
     */
    public static void destroy() throws Exception
    {
        parser = null;
        tlp = null;
    }
    
    /**
     * Matches a single space followed by a bracketed run of non-whitespace
     * characters. parse() strips every such annotation from the tree string.
     * Compiled once here instead of on every call.
     */
    private static final Pattern bracket_annotation_pattern = Pattern.compile(" \\[[\\S]+\\]");

    /**
     * Parses a sentence and returns a string representation of the parse tree.
     * <p>
     * The sentence is tokenized with the tokenizer supplied by the language
     * pack, handed to the shared parser, and the best parse is rendered with
     * {@code toString()}. Every " [...]" annotation (a space followed by a
     * bracketed run of non-whitespace characters) is then removed.
     * <p>
     * The character-offset labelling step (updateTreeLabels, createMapping,
     * mapOffsets) is currently disabled, so the result carries no offsets.
     *
     * @param sentence a sentence
     * @return the best parse tree as a string, without bracket annotations
     * @throws RuntimeException if the parser has not been initialized
     */
    @SuppressWarnings("unchecked")
    public static String parse(String sentence)
    {
        if (tlp == null || parser == null)
            throw new RuntimeException("Parser has not been initialized");

        // parse the sentence to produce stanford Tree
        log.debug("Parsing sentence");
        Tree tree;
        // parse() and getBestParse() are two separate calls on the shared
        // parser, so they are kept together under the parser's monitor.
        synchronized (parser) {
            Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
            List<Word> words = tokenizer.tokenize();
            log.debug("Tokenization: "+words);
            // NOTE(review): the return value of parser.parse(...) is ignored;
            // confirm what getBestParse() yields when no parse is found.
            parser.parse(new Sentence(words));
            tree = parser.getBestParse();
        }

        // Disabled: label tree with character extents
        //log.debug("Setting character extents");
        //updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1));
        //log.debug("Creating offset mapping");
        //List<RangeMap> mapping = createMapping(sentence);
        //log.debug(mapping.toString());
        //log.debug("Applying offset mapping");
        //mapOffsets(tree, mapping);

        return bracket_annotation_pattern.matcher(tree.toString()).replaceAll("");
    }
	
	/**
	 * Runs the parser on a sentence and reports the resulting PCFG score,
	 * which callers can use as a confidence measure.
	 *
	 * @param sentence a sentence
	 * @return PCFG score
	 * @throws RuntimeException if the parser has not been initialized
	 */
	@SuppressWarnings("unchecked")
	public static double getPCFGScore(String sentence) {
		boolean ready = tlp != null && parser != null;
		if (!ready) {
			throw new RuntimeException("Parser has not been initialized");
		}

		// tokenize and parse under the parser's monitor, then read the score
		log.debug("Parsing sentence");
		synchronized (parser) {
			Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
			List<Word> tokens = tokenizer.tokenize();
			log.debug("Tokenization: "+tokens);
			parser.parse(new Sentence(tokens));
			return parser.getPCFGScore();
		}
	}
    
    /**
     * Recursively replaces the label of every non-leaf node in {@code tree}
     * with a MapLabel holding adjusted begin/end character extents and the
     * current leaf index.
     * <p>
     * Raw extents come from {@code root.leftCharEdge} / {@code rightCharEdge}.
     * A running offset is added to them: the begin extent uses the offset as
     * it stands before the node is processed, the end extent uses the offset
     * after the node and all of its descendants have been processed.
     *
     * @param root the tree against which character edges are computed
     * @param tree the node currently being processed
     * @param offset running correction added to raw extents; updated in place
     * @param leafIndex running leaf counter; incremented once per leaf visited
     */
    protected static void updateTreeLabels(Tree root, Tree tree, MutableInteger offset, MutableInteger leafIndex)
    {
        // leaves receive no extents; they only advance the leaf counter
        if (tree.isLeaf()) {
            leafIndex.value++;
            return;
        }

        final String upperLabel = tree.label().value().toUpperCase();
        final int rawBegin = root.leftCharEdge(tree);
        final int rawEnd = root.rightCharEdge(tree);
        final int span = rawEnd - rawBegin;

        // begin extent: shifted by the offset accumulated so far
        final int shiftedBegin = rawBegin + offset.value;

        // adjust the running offset according to the node's label / leaf text
        if (double_quote_lable_pattern.matcher(upperLabel).matches() && span > 1) {
            offset.value -= 1;
            log.debug("Quotes label pattern fired: "+offset);
        } else if (bracket_label_pattern.matcher(upperLabel).matches()) {
            offset.value -= 4;
            log.debug("Bracket label pattern fired: "+offset);
        } else if (tree.isPreTerminal()) {
            // one step back for every escaped slash in the leaf's text
            String leafText = tree.firstChild().label().value();
            Matcher escapes = escaped_char_pattern.matcher(leafText);
            while (escapes.find()) {
                offset.value -= 1;
            }
        }

        // descend before computing the end extent, so that adjustments made
        // by descendants are included
        for (Tree kid : tree.children()) {
            updateTreeLabels(root, kid, offset, leafIndex);
        }

        // end extent: shifted by the offset after the whole subtree was seen
        final int shiftedEnd = rawEnd + offset.value;

        // store extents and leaf index on a fresh MapLabel for this node
        MapLabel extents = new MapLabel(tree.label());
        extents.put(BEGIN_KEY, shiftedBegin);
        extents.put(END_KEY, shiftedEnd);
        extents.put(MapLabel.INDEX_KEY, leafIndex.value);
        tree.setLabel(extents);
    }

    /**
     * Builds the offset mapping for a sentence from its whitespace runs.
     * <p>
     * One DeltaRangeMap is produced per whitespace run. Its delta is the total
     * length of all whitespace seen up to and including that run; its begin is
     * the whitespace-depleted offset at which the run occurs; its end is the
     * depleted offset of the next run (or of the end of the sentence for the
     * last one).
     * <p>
     * The result is an ArrayList because mapOffsets reads it by index.
     *
     * @param sentence the original sentence, including whitespace
     * @return a list of RangeMap objects which define a mapping of character
     * offsets in a white-space depleted version of the input string back into
     * offsets in the input string; empty if the sentence has no whitespace
     */
    protected static List<RangeMap> createMapping(String sentence)
    {
        List<RangeMap> mapping = new ArrayList<RangeMap>();
        Matcher whitespace_matcher = whitespace_pattern.matcher(sentence);
        DeltaRangeMap delta_rmap = null;

        // find all sequences of whitespace chars
        while (whitespace_matcher.find()) {
            int start = whitespace_matcher.start();
            int end = whitespace_matcher.end();
            int length = end - start;

            if (delta_rmap == null) {
                // create a new RangeMap object whose start begins at current
                // match start, and whose end is at the moment undefined (0 is
                // a placeholder). The delta here is taken to be the length of
                // the whitespace sequence.
                delta_rmap = new DeltaRangeMap(start, 0, length);
            } else {
                // we've found the next sequence of whitespace chars, so we
                // finalize the end extent of the previous RangeMap, and make a
                // new RangeMap to describe the mapping from this point forward.
                // Subtracting the accumulated delta converts the source offset
                // of this run into a depleted-string offset.
                delta_rmap.end = start - delta_rmap.delta;
                mapping.add(delta_rmap);
                delta_rmap = new DeltaRangeMap(delta_rmap.end, 0, delta_rmap.delta + length);
            }
        }

        // process trailing DeltaRangeMap if it exists: it extends to the end
        // of the depleted string
        if (delta_rmap != null) {
            delta_rmap.end = sentence.length() - delta_rmap.delta;
            mapping.add(delta_rmap);
        }

        return mapping;
    }

    /**
     * Maps Tree node offsets using provided mapping.
     * <p>
     * For every non-leaf node the begin and end values stored under BEGIN_KEY
     * and END_KEY are read from the node's MapLabel, translated through the
     * applicable RangeMap, and written back. Nodes must therefore already
     * carry MapLabel labels (as set by updateTreeLabels); any other label type
     * makes the cast below fail.
     * <p>
     * NOTE(review): the begin cursor only moves forward across nodes, which
     * assumes nodes are visited in non-decreasing begin order; confirm
     * against the iteration order of Tree.
     *
     * @param tree the Tree whose begin and end extents should be mapped.
     * @param mapping the list of RangeMap objects which defines the mapping;
     * a null or empty list leaves the tree untouched.
     */
    protected static void mapOffsets(Tree tree, List<RangeMap> mapping)
    {
        // if mapping is empty, then assume 1-to-1 mapping.
        if (mapping == null || mapping.size() == 0) return;

        // cursor into the mapping for begin offsets; shared across all nodes
        int begin_map_index = 0;
        RangeMap begin_rmap = mapping.get(begin_map_index);
        TREE: for (Tree t : tree) {
            // leaves carry no extents
            if (t.isLeaf()) continue;
            MapLabel label = (MapLabel) t.label();
            int begin = (Integer) label.get(BEGIN_KEY);
            // "end" must be index of last char in range, so convert the
            // stored exclusive end into an inclusive one
            int end = (Integer) label.get(END_KEY) - 1;

            // find the first rangemap whose end is greater than the
            // beginning of current annotation. If the mapping is exhausted,
            // processing stops for this and all remaining nodes.
            // log.debug("Finding RangeMap whose extents include
            // annotation.begin");
            while (begin_rmap.end <= begin) {
                begin_map_index++;
                if (begin_map_index >= mapping.size()) break TREE;
                begin_rmap = mapping.get(begin_map_index);
            }

            // if beginning of current rangemap is greater than end of
            // current annotation, then skip this annotation (default
            // mapping is 1-to-1).
            if (begin_rmap.begin > end) {
                // log.debug("Skipping annotation (assuming 1-to-1 offset
                // mapping)");
                continue;
            }

            // if beginning of current annotation falls within current range
            // map, then map it back to source space; otherwise keep it as is.
            int new_begin = begin;
            if (begin_rmap.begin <= new_begin) {
                // log.debug("Applying RangeMap to begin offset");
                new_begin = begin_rmap.map(new_begin);
            }

            // find the first rangemap whose end is greater than the end of
            // current annotation, starting from the begin cursor. If the
            // mapping is exhausted, end_rmap stays at the last map reached.
            // log.debug("Finding RangeMap whose extents include
            // annotation.end");
            int end_map_index = begin_map_index;
            RangeMap end_rmap = begin_rmap;
            END_OFFSET: while (end_rmap.end <= end) {
                end_map_index++;
                if (end_map_index >= mapping.size()) break END_OFFSET;
                end_rmap = mapping.get(end_map_index);
            }

            // if end of current annotation falls within "end" range map,
            // then map it back to source space.
            int new_end = end;
            if (end_rmap.begin <= end) {
                // log.debug("Applying RangeMap to end offset");
                new_end = end_rmap.map(end);
            }

            // write back; the end is converted to exclusive form again
            label.put(BEGIN_KEY, new_begin);
            label.put(END_KEY, new_end + 1);
        }
    }
    
//  private static void printOffsets(String sentence, Tree tree)
//  {
//      if (tree.isLeaf()) return;
//      MapLabel label = (MapLabel) tree.label();
//      int begin = (Integer) label.get(BEGIN_KEY);
//      int end = (Integer) label.get(END_KEY);
//      int index = (Integer) label.index();
//      String str = null;
//      if (begin < 0 || begin > sentence.length() || end < begin || end > sentence.length()) {
//          str = "error";
//      } else {
//          str = sentence.substring(begin, end);
//      }
//      System.out.println(label.value()+"("+index+":"+begin+","+end+"): "+str);
//      for (Tree child : tree.children())
//          printOffsets(sentence, child);
//  }
    
    /**
     * Command-line entry point: parses every line of the input file as one
     * sentence and writes one parse per line to {@code <inputFile>.parses}.
     * <p>
     * Sentences are parsed and written as they are read. Both streams are
     * closed even if reading, parsing or writing fails.
     *
     * @param args exactly one element: the path of the input sentences file
     * @throws Exception if initialization, reading, parsing or writing fails
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length != 1) {
            System.out.println("USAGE: StanfordParser <inputSentencesFile>");
            System.out.println("Output stored in: <inputSentencesFile>.parses");
            System.exit(0);
        }
        StanfordParser.initialize();

        // try/finally rather than try-with-resources: the rest of this file
        // uses no language features newer than Java 5.
        BufferedReader in = new BufferedReader(new FileReader(args[0]));
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter(args[0]+".parses"));
            try {
                String sentence;
                while ((sentence = in.readLine()) != null) {
                    out.append(StanfordParser.parse(sentence)+"\n");
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }
}