LexicalizedParserITest.java example

Explorer
CoreNLP-master
// LexicalizedParserITest
// Copyright (c) 2002-2010 Leland Stanford Junior University

//This program is free software; you can redistribute it and/or
//modify it under the terms of the GNU General Public License
//as published by the Free Software Foundation; either version 2
//of the License, or (at your option) any later version.

//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.

//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//For more information, bug reports, fixes, contact:
//Christopher Manning
//Dept of Computer Science, Gates 1A
//Stanford CA 94305-9010
//USA
//Support/Questions: java-nlp-user@lists.stanford.edu
//Licensing: java-nlp-support@lists.stanford.edu
//http://www-nlp.stanford.edu/software/tagger.shtml



package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.SentenceUtils;
import junit.framework.TestCase;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TreebankLanguagePack;

import java.io.File;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;


/**
 * The purpose of this itest is a simple test to make sure the
 * standard LexicalizedParser parses things in an expected way.  Since
 * simple sentences should be parsed in the same way regardless of
 * updated data files, this mostly tests the interface and that the
 * data file in the default location hasn't drastically changed.
 *
 * @author John Bauer
 */
public class LexicalizedParserITest extends TestCase {

  private static LexicalizedParser englishParser = null;
  private static TreePrint tagPrint = null;
  private static TreePrint pennPrint = null;
  private static TreePrint typDepPrint = null;
  private static TreePrint typDepColPrint = null;

  private static LexicalizedParser chineseParser = null;
  private static TreePrint chinesePennPrint = null;
  private static TreePrint chineseTypDepPrint = null;

  // TODO: add more tests

  @Override
  public void setUp() throws Exception {
    synchronized(LexicalizedParserITest.class) {
      if (englishParser == null) {
        // sharing a bunch of code here with the webapp in
        // parser/webapp/index.jsp...  perhaps we could reuse that code
        englishParser = LexicalizedParser.loadModel();
        TreebankLanguagePack tLP =
          englishParser.getOp().tlpParams.treebankLanguagePack();
        tagPrint = new TreePrint("wordsAndTags", tLP);
        pennPrint = new TreePrint("penn", tLP);
        typDepPrint = new TreePrint("typedDependencies", "basicDependencies", tLP);
        typDepColPrint = new TreePrint("typedDependencies", tLP);  // default is now CCprocessed

        File englishPath = new File(LexicalizedParser.DEFAULT_PARSER_LOC);
        String chinesePath = (englishPath.getParent() + File.separator +
                              "chineseFactored.ser.gz");
        chineseParser = LexicalizedParser.loadModel(chinesePath);
        tLP = chineseParser.getOp().tlpParams.treebankLanguagePack();
        chineseParser.getTLPParams().setGenerateOriginalDependencies(true); // test was made with Chinese SD not UD

        chinesePennPrint = new TreePrint("penn", tLP);
        chineseTypDepPrint = new TreePrint("typedDependencies", "basicDependencies", tLP);
      }
    }
  }

  /**
   * Compares one view of the result tree to the expected results.
   *
   * Setting outputResults to true makes it print out the results.
   * This is useful because assertEquals sometimes abbreviates the
   * strings on failure, which makes it hard to diagnose.
   */
  private static void compareSingleOutput(Tree results, boolean outputResults,
                                          TreePrint printer,
                                          String expectedOutput) {
    StringWriter sw = new StringWriter();
    printer.printTree(results, (new PrintWriter(sw)));
    if (expectedOutput != null) {
      expectedOutput = expectedOutput.replaceAll("\\s+", " ").trim();
    }
    String actualOutput = sw.toString().replaceAll("\\s+", " ").trim();
    if (outputResults) {
      if (expectedOutput != null) {
        System.out.println(expectedOutput);
      }
      System.out.println(actualOutput);
    }
    if (expectedOutput != null) {
      assertEquals(expectedOutput, actualOutput);
    }
  }

  /**
   * Given a tree and a bunch of expected strings, this method takes
   * that tree and compares its components to the expected output by
   * printing the tree in a few different ways.  There are probably
   * better ways of testing the trees, ie by comparing the tree
   * directly instead of printing it out, but printing it also makes
   * the output very easy to inspect visually.
   *
   * Setting outputResults to true makes it print out the results.
   */
  private static void compareOutput(Tree results, boolean outputResults,
                                    String expectedTags,
                                    String expectedPenn,
                                    String expectedDep,
                                    String expectedDepCol) {
    compareSingleOutput(results, outputResults, tagPrint, expectedTags);
    compareSingleOutput(results, outputResults, pennPrint, expectedPenn);
    compareSingleOutput(results, outputResults, typDepPrint, expectedDep);
    compareSingleOutput(results, outputResults, typDepColPrint, expectedDepCol);
  }

  private static List<CoreLabel> sampleSausage() {
    String[] words = {"My", "dog", "also", "likes", "eating", "sausage", "."};
    return SentenceUtils.toCoreLabelList(words);
  }

  /**
   * This method tests a very basic string and a few different results
   * that parsing that string should come up with.
   */
  public void testParseString() {
    Tree results = englishParser.parse("My dog likes to eat yoghurt.");
    compareOutput(results, false,
                  "My/PRP$ dog/NN likes/VBZ to/TO eat/VB yoghurt/NN ./.",
                  "(ROOT (S (NP (PRP$ My) (NN dog)) (VP (VBZ likes) (S (VP (TO to) (VP (VB eat) (NP (NN yoghurt)))))) (. .)))",
                  "nmod:poss(dog-2, My-1) nsubj(likes-3, dog-2) root(ROOT-0, likes-3) mark(eat-5, to-4) xcomp(likes-3, eat-5) dobj(eat-5, yoghurt-6)",
                  "nmod:poss(dog-2, My-1) nsubj(likes-3, dog-2) nsubj:xsubj(eat-5, dog-2) root(ROOT-0, likes-3) mark(eat-5, to-4) xcomp(likes-3, eat-5) dobj(eat-5, yoghurt-6)");
  }

  /**
   * Test the query structure that you can use for better control of
   * the parse
   */
  public void testParserQuery() {
    List<CoreLabel> sentence = sampleSausage();
    ParserQuery pq = englishParser.parserQuery();
    pq.parse(sentence);
    compareSingleOutput(pq.getBestParse(), false, pennPrint,
                        "(ROOT (S (NP (PRP$ My) (NN dog)) (ADVP (RB also)) (VP (VBZ likes) (S (VP (VBG eating) (NP (NN sausage))))) (. .)))");
  }

  public void testParseMultiple() {
    List<List<CoreLabel>> sentences = new ArrayList<>();
    sentences.add(SentenceUtils.toCoreLabelList("The", "Flyers", "lost", "again", "last", "night", "."));
    sentences.add(SentenceUtils.toCoreLabelList("If", "this", "continues", ",", "they", "will", "miss", "the", "playoffs", "."));
    sentences.add(SentenceUtils.toCoreLabelList("Hopefully", "they", "can", "turn", "it", "around", "."));
    sentences.add(SentenceUtils.toCoreLabelList("Winning", "on", "Wednesday", "would", "be", "a", "good", "first", "step", "."));
    sentences.add(SentenceUtils.toCoreLabelList("Their", "next", "opponent", "is", "quite", "bad", "."));

    List<Tree> results1 = englishParser.parseMultiple(sentences);
    List<Tree> results2 = englishParser.parseMultiple(sentences, 3);

    assertEquals(results1, results2);
  }

  /**
   * Test what happens if you put a constraint on the parse
   */
  public void testConstraints() {
    List<CoreLabel> sentence = sampleSausage();

    ParserQuery pq = englishParser.parserQuery();

    ParserConstraint constraint =
      new ParserConstraint(0, 2, "SBAR|SBAR[^a-zA-Z].*");
    List<ParserConstraint> constraints = new ArrayList<>();
    constraints.add(constraint);
    pq.setConstraints(constraints);

    pq.parse(sentence);

    StringWriter sw = new StringWriter();
    pennPrint.printTree(pq.getBestParse(), (new PrintWriter(sw)));
    String actualOutput = sw.toString().replaceAll("\\s+", " ").trim();

    String expectedOutput = "(ROOT (S (NP (PRP$ My) (NN dog)) (ADVP (RB also)) (VP (VBZ likes) (S (VP (VBG eating) (NP (NN sausage))))) (. .)))";
    expectedOutput = expectedOutput.replaceAll("\\s+", " ").trim();

    // Not exactly sure what should come back, but it shouldn't be the
    // original output any more
    assertFalse("Tree should not match the original tree any more",
                expectedOutput.equals(actualOutput));
    assertTrue("Tree should be forced to contain SBAR",
            actualOutput.contains("SBAR"));

    //System.out.println(pq.getBestParse());
  }

  private static final String chineseTest = "我 看 了 一 条 狗";
  private static final String expectedChineseTree = "(ROOT (IP (NP (PN 我)) (VP (VV 看) (AS 了) (NP (QP (CD 一) (CLP (M 条))) (NP (NN 狗))))))";
  private static final String expectedChineseDeps = "nsubj(看-2, 我-1) root(ROOT-0, 看-2) asp(看-2, 了-3) nummod(条-5, 一-4) clf(狗-6, 条-5) dobj(看-2, 狗-6)";

  public static void testChineseDependencies() {
    Tree tree = chineseParser.parse(chineseTest);
    compareSingleOutput(tree, false, chinesePennPrint, expectedChineseTree);
    compareSingleOutput(tree, false, chineseTypDepPrint, expectedChineseDeps);
  }

  private static final String chineseTest2 = "这里 是 新闻 之 夜 ．";
  private static final String expectedChineseTree2 = "(ROOT (IP (NP (PN 这里)) (VP (VC 是) (NP (DNP (NP (NN 新闻)) (DEG 之)) (NP (NN 夜)))) (PU ．)))";
  /** This is the right answer for Chinese UD. */
  private static final String expectedChineseDeps2 = "nsubj(夜-5, 这里-1) cop(夜-5, 是-2) assmod(夜-5, 新闻-3) case(新闻-3, 之-4) root(ROOT-0, 夜-5)";
  /** This is the right answer for old-style Chinese SD. */
  private static final String expectedChineseDeps2sd = "top(是-2, 这里-1) root(ROOT-0, 是-2) assmod(夜-5, 新闻-3) assm(新闻-3, 之-4) attr(是-2, 夜-5)";

  public void testChineseDependenciesSemanticHead() {
    Tree tree = chineseParser.parse(chineseTest2);
    compareSingleOutput(tree, false, chinesePennPrint, expectedChineseTree2);
    compareSingleOutput(tree, false, chineseTypDepPrint, expectedChineseDeps2sd);
    TreePrint paramsTreePrint = new TreePrint("typedDependencies", "basicDependencies", chineseParser.treebankLanguagePack(), chineseParser.getTLPParams().headFinder(), chineseParser.getTLPParams().typedDependencyHeadFinder());
    compareSingleOutput(tree, false, paramsTreePrint, expectedChineseDeps2sd);
  }

  public void testAlreadyTagged() {
    List<CoreLabel> words = SentenceUtils.toCoreLabelList("foo", "bar", "baz");
    words.get(1).setTag("JJ");
    Tree tree = englishParser.parse(words);
    assertEquals("JJ", tree.taggedYield().get(1).tag());

    words.get(1).setTag("NN");
    tree = englishParser.parse(words);
    assertEquals("NN", tree.taggedYield().get(1).tag());
  }

  public void testTagRegex() {
    List<CoreLabel> words = SentenceUtils.toCoreLabelList("foo", "bar", "baz");
    words.get(1).set(ParserAnnotations.CandidatePartOfSpeechAnnotation.class, "JJ");
    Tree tree = englishParser.parse(words);
    assertEquals("JJ", tree.taggedYield().get(1).tag());

    words.get(1).set(ParserAnnotations.CandidatePartOfSpeechAnnotation.class, "NN|NNP");
    tree = englishParser.parse(words);
    assertTrue(tree.taggedYield().get(1).tag().equals("NN") ||
            tree.taggedYield().get(1).tag().equals("NNP"));
  }

  public void testCharOffsets() {
    String text = "  You can  eat fruits   such as apples and   oranges.";
    String[] tokens = { "You", "can", "eat", "fruits", "such", "as", "apples", "and", "oranges", "." };
    int[] begins = { 2, 6, 11, 15, 24, 29, 32, 39, 45, 52 };
    int[] ends = { 5, 9, 14, 21, 28, 31, 38, 42, 52, 53 };

    Tree tree = englishParser.parse(text);

    List<CoreLabel> yield = tree.yield(new ArrayList<CoreLabel>());

    assertEquals("Wrong number of tokens in parser output", tokens.length, yield.size());

    int i = 0;
    for (CoreLabel cl : yield) {
      assertEquals("Wrong token", tokens[i], cl.word());
      assertEquals("Wrong char begin", begins[i], (int) cl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
      assertEquals("Wrong char end", ends[i], (int) cl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
      i++;
    }
  }


}