// SpanishTreeNormalizerITest.java example
// (Explorer: CoreNLP-master)
package edu.stanford.nlp.trees.international.spanish;

import java.io.IOException;
import java.io.StringReader;

import junit.framework.TestCase;

import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.util.Pair;

/**
 * @author Jon Gauthier
 */
public class SpanishTreeNormalizerITest extends TestCase {

  private TreeFactory tf;
  private SpanishTreeNormalizer tn;

  public void setUp() {
    tf = new LabeledScoredTreeFactory();
    tn = new SpanishTreeNormalizer(true, true, true);
  }

  @SuppressWarnings("unchecked")
  Pair<String, String>[] multiWordTestCases = new Pair[] {
    // Simplest case
    new Pair("(a (b c_d))",
             "(a (MW_PHRASE?_b (MW? c) (MW? d)))"),

    // New MW phrase should merge with grup.nom head
    new Pair("(grup.nom (np00000 Josep_Maria_Ollé))",
             "(MW_PHRASE?_np00000 (MW? Josep) (MW? Maria) (MW? Ollé))"),

    // Likewise here: new MW phrase should merge with grup.nom head
    new Pair("(grup.nom (grup.nom (nc0p000 productos)) (sp (prep (sp000 de)) (sn (grup.nom (np00000 American_Online)))))",
             "(grup.nom (grup.nom (nc0p000 productos)) (sp (prep (sp000 de)) (sn (MW_PHRASE?_np00000 (MW? American) (MW? Online)))))"),

    // Two multi-word tokens as siblings
    new Pair("(a (b c_d) (b e_f))",
             "(a (MW_PHRASE?_b (MW? c) (MW? d)) (MW_PHRASE?_b (MW? e) (MW? f)))"),

    // Quotation mark "words" should be separated
    new Pair("(a (b \"cde\"))",
             "(a (MW_PHRASE?_b (MW? \") (MW? cde) (MW? \")))"),

    // Hyphenated expression should be separated, with hyphen retained
    new Pair("(a (b tecno-pop))",
             "(a (MW_PHRASE?_b (MW? tecno) (MW? -) (MW? pop)))"),

    // Hyphenated expression with bound morpheme should not be separated
    new Pair("(a (b co-promotora))",
             "(a (b co-promotora))"),

    // Don't bork when we see a bound morpheme without following hyphen
    new Pair("(a (b co) (b promotora))",
             "(a (b co) (b promotora))"),

    // Don't treat commas as multiword separators if they are part of a
    // decimal number expression
    new Pair("(a (b 8,39))", "(a (b 8,39))"),
    new Pair("(a (b 28,91%))", "(a (MW_PHRASE?_b (MW? 28,91) (MW? %)))"),

    // But do treat commas as multiword separators otherwise
    new Pair("(a (b entonces,_yo))", "(a (MW_PHRASE?_b (MW? entonces) (MW? ,) (MW? yo)))"),
  };

  public void testMultiWordNormalization() {
    for (Pair<String, String> testCase : multiWordTestCases) {
      Tree head = readTree(testCase.first());
      for (Tree t : head) {
        if (t.isPrePreTerminal())
          tn.normalizeForMultiWord(t, tf);
      }

      assertEquals(testCase.second(), head.toString());
    }
  }

  /**
   * Read a tree from a PTB-style serialized form in the given string.
   */
  private Tree readTree(String treeRep) {
    try {
      return new PennTreeReader(new StringReader(treeRep), tf).readTree();
    } catch (IOException e) { return null; }
  }

}