package edu.stanford.nlp.trees.international.spanish; import java.io.IOException; import java.io.StringReader; import junit.framework.TestCase; import edu.stanford.nlp.trees.LabeledScoredTreeFactory; import edu.stanford.nlp.trees.PennTreeReader; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeFactory; import edu.stanford.nlp.util.Pair; /** * @author Jon Gauthier */ public class SpanishTreeNormalizerITest extends TestCase { private TreeFactory tf; private SpanishTreeNormalizer tn; public void setUp() { tf = new LabeledScoredTreeFactory(); tn = new SpanishTreeNormalizer(true, true, true); } @SuppressWarnings("unchecked") Pair<String, String>[] multiWordTestCases = new Pair[] { // Simplest case new Pair("(a (b c_d))", "(a (MW_PHRASE?_b (MW? c) (MW? d)))"), // New MW phrase should merge with grup.nom head new Pair("(grup.nom (np00000 Josep_Maria_Ollé))", "(MW_PHRASE?_np00000 (MW? Josep) (MW? Maria) (MW? Ollé))"), // Likewise here: new MW phrase should merge with grup.nom head new Pair("(grup.nom (grup.nom (nc0p000 productos)) (sp (prep (sp000 de)) (sn (grup.nom (np00000 American_Online)))))", "(grup.nom (grup.nom (nc0p000 productos)) (sp (prep (sp000 de)) (sn (MW_PHRASE?_np00000 (MW? American) (MW? Online)))))"), // Two multi-word tokens as siblings new Pair("(a (b c_d) (b e_f))", "(a (MW_PHRASE?_b (MW? c) (MW? d)) (MW_PHRASE?_b (MW? e) (MW? f)))"), // Quotation mark "words" should be separated new Pair("(a (b \"cde\"))", "(a (MW_PHRASE?_b (MW? \") (MW? cde) (MW? \")))"), // Hyphenated expression should be separated, with hyphen retained new Pair("(a (b tecno-pop))", "(a (MW_PHRASE?_b (MW? tecno) (MW? -) (MW? pop)))"), // Hyphenated expression with bound morpheme should not be separated new Pair("(a (b co-promotora))", "(a (b co-promotora))"), // Don't bork when we see a bound morpheme without following hyphen new Pair("(a (b co) (b promotora))", "(a (b co) (b promotora))"), // Don't treat commas as multiword separators if they are part of a // decimal number expression new Pair("(a (b 8,39))", "(a (b 8,39))"), new Pair("(a (b 28,91%))", "(a (MW_PHRASE?_b (MW? 28,91) (MW? %)))"), // But do treat commas as multiword separators otherwise new Pair("(a (b entonces,_yo))", "(a (MW_PHRASE?_b (MW? entonces) (MW? ,) (MW? yo)))"), }; public void testMultiWordNormalization() { for (Pair<String, String> testCase : multiWordTestCases) { Tree head = readTree(testCase.first()); for (Tree t : head) { if (t.isPrePreTerminal()) tn.normalizeForMultiWord(t, tf); } assertEquals(testCase.second(), head.toString()); } } /** * Read a tree from a PTB-style serialized form in the given string. */ private Tree readTree(String treeRep) { try { return new PennTreeReader(new StringReader(treeRep), tf).readTree(); } catch (IOException e) { return null; } } }