/* Copyright 2003, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.text;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import junit.framework.TestCase;
import junit.framework.TestSuite;
import edu.cmu.minorthird.text.mixup.Mixup;
import edu.cmu.minorthird.text.mixup.MixupInterpreter;
import edu.cmu.minorthird.text.mixup.MixupProgram;

/**
 * JUnit suite bundling the tests for the text package: XML markup
 * generation, span differencing, trie lookup, Mixup expressions and
 * programs, span properties, and retokenization.
 *
 * @author William Cohen
 */
public class TestPackage extends TestSuite{

  /** When true, the tests print their intermediate results to stdout. */
  public static final boolean DEBUG=false;

  public TestPackage(String name){
    super(name);
  }

  /**
   * Builds the suite, adding one instance of each nested test case bound to
   * its <code>doTest</code> method.
   *
   * @return the assembled suite
   */
  public static TestSuite suite(){
    TestSuite suite=new TestSuite();
    suite.addTest(new ToXMLTest("doTest"));
    suite.addTest(new DiffTest("doTest"));
    suite.addTest(new TrieTest("doTest"));
    suite.addTest(new MixupTest("doTest"));
    suite.addTest(new LabelsTest("doTest"));
    suite.addTest(new TokenizationTest("doTest"));
    return suite;
  }

  /**
   * Retokenizes a five-line document with a newline-splitting tokenizer and
   * checks that the first and last tokens of the child text base are the
   * first and last lines of the document.
   */
  public static class TokenizationTest extends TestCase{

    public TokenizationTest(String string){
      super(string);
    }

    public void doTest(){
      // what is this first line for? - frank
      // NOTE(review): the instance is discarded; kept in case construction
      // has side effects -- confirm before removing.
      new TextBaseLoader();

      BasicTextBase b=new BasicTextBase();
      b.loadDocument("letters","a b c d\ne f g h\ni j k l\nm n o p\nr s t u");

      Tokenizer tok=new SplitTokenizer("\n");
      TextBaseManager tbman=new TextBaseManager("root",b);
      MutableTextBase childTB=tbman.retokenize(tok,"root","childtb");
      //childTB = b.retokenize(tok);
      MutableTextLabels lab=new BasicTextLabels(childTB);

      try{
        MixupProgram p=new MixupProgram(new String[]{
            "defTokenProp token:first =: [any] ...",
            "defSpanType first =: [token:first] ...",
            "defTokenProp token:last =: ... [any]",
            "defSpanType last =: ... [token:last]"});
        MixupInterpreter interp=new MixupInterpreter(p);
        interp.eval(lab);

        Iterator<Span> firstSpans=lab.instanceIterator("first");
        assertTrue(firstSpans.hasNext());
        assertEquals("a b c d",firstSpans.next().asString());

        Iterator<Span> lastSpans=lab.instanceIterator("last");
        assertTrue(lastSpans.hasNext());
        assertEquals("r s t u",lastSpans.next().asString());
      }catch(Mixup.ParseException ex){
        // keep the original exception as the cause so the parse location is not lost
        throw new IllegalStateException(ex.toString(),ex);
      }
    }
  }

  /**
   * Defines span properties with a Mixup program and checks the spans
   * returned by <code>getSpansWithProperty("startsWith")</code>.
   */
  public static class LabelsTest extends TestCase{

    public LabelsTest(String string){
      super(string);
    }

    public void doTest(){
      BasicTextBase b=new BasicTextBase();
      MutableTextLabels lab=new BasicTextLabels(b);
      b.loadDocument("d1","a b c b d");

      try{
        MixupProgram p=new MixupProgram(new String[]{
            "defSpanProp startsWith:b =: ... ['b' any]...",
            "defSpanProp startsWith:c =: ... ['c' any]...",
            "defSpanProp endsWith:b =: ... [any 'b']..."});
        MixupInterpreter interp=new MixupInterpreter(p);
        interp.eval(lab);

        Iterator<Span> looper=lab.getSpansWithProperty("startsWith");
        assertTrue(looper.hasNext());
        assertEquals("b c",looper.next().asString());
        assertTrue(looper.hasNext());
        assertEquals("c b",looper.next().asString());
        assertTrue(looper.hasNext());
        assertEquals("b d",looper.next().asString());
        assertTrue(!looper.hasNext());
      }catch(Mixup.ParseException ex){
        throw new IllegalStateException(ex.toString(),ex);
      }
    }
  }

  /**
   * Checks the XML markup produced by
   * <code>TextLabelsLoader.createXMLmarkup</code> for nested, disjoint and
   * adjacent spans, and that overlapping spans are rejected.
   */
  public static class ToXMLTest extends TestCase{

    private BasicTextBase b;

    public ToXMLTest(String string){
      super(string);
      b=new BasicTextBase();
      b.loadDocument("test","a b c d e f g");
    }

    public void doTest(){
      // Test marking up nested spans first
      MutableTextLabels e1=new BasicTextLabels(b);
      e1.addToType(testSpan(1,3),"x");
      checkXML(e1,"<root>a <x>b c d</x> e f g</root>");
      e1.addToType(testSpan(2,1),"y");
      checkXML(e1,"<root>a <x>b <y>c</y> d</x> e f g</root>");

      // Test marking up multiple non-nested spans next.
      MutableTextLabels e2=new BasicTextLabels(b);
      e2.addToType(testSpan(1,2),"x");
      e2.addToType(testSpan(4,1),"y");
      checkXML(e2,"<root>a <x>b c</x> d <y>e</y> f g</root>");

      // Test marking up adjacent spans, where one begins on the token
      // right after the one where the other ends.
      MutableTextLabels e3=new BasicTextLabels(b);
      e3.addToType(testSpan(0,3),"x");
      e3.addToType(testSpan(3,1),"y");
      checkXML(e3,"<root><x>a b c</x> <y>d</y> e f g</root>");
      e3.addToType(testSpan(5,2),"z");
      checkXML(e3,"<root><x>a b c</x> <y>d</y> e <z>f g</z></root>");

      // Add an overlapping span and check that the system throws an
      // IllegalArgumentException. This is because overlapping spans cannot
      // be written as straight XML.
      MutableTextLabels e4=new BasicTextLabels(b);
      e4.addToType(testSpan(1,3),"x");
      checkXML(e4,"<root>a <x>b c d</x> e f g</root>");
      e4.addToType(testSpan(2,3),"y");
      boolean caughtException=false;
      try{
        new TextLabelsLoader().createXMLmarkup("test",e4);
      }catch(IllegalArgumentException expected){
        caughtException=true;
      }
      assertTrue("overlapping spans should raise IllegalArgumentException",
          caughtException);
    }

    /** Asserts that the markup generated for document "test" equals <code>expected</code>. */
    private void checkXML(TextLabels e,String expected){
      String actual=new TextLabelsLoader().createXMLmarkup("test",e);
      if(DEBUG){
        System.out.println("expected: '"+expected+"'");
        System.out.println("actual: '"+actual+"'");
        System.out.println("");
      }
      assertEquals(expected,actual);
    }

    /** Returns the sub-span of document "test" given by <code>subSpan(lo,len)</code>. */
    private Span testSpan(int lo,int len){
      return b.documentSpan("test").subSpan(lo,len);
    }
  }

  //
  // difference-testing code
  // to do: test on adjacent/overlapping spans
  //

  /**
   * Builds a "guess" and a "truth" set of spans over five small documents
   * and checks the sequence of (span, status) pairs reported by
   * <code>SpanDifference</code>.
   */
  public static class DiffTest extends TestCase{

    public DiffTest(String string){
      super(string);
    }

    public void doTest(){
      SortedSet<Span> guess=new TreeSet<Span>();
      SortedSet<Span> truth=new TreeSet<Span>();
      BasicTextBase b=new BasicTextBase();
      TextLabels e=new BasicTextLabels(b);
      b.loadDocument("a-d","a b c d");
      b.loadDocument("e-h","e f g h");
      b.loadDocument("i-l","i j k l");
      b.loadDocument("m-p","m n o p");
      b.loadDocument("r-u","r s t u");

      try{
        truth.add(firstMatch("'a' ['b' 'c'] 'd'",e,b));
        truth.add(firstMatch("'e' 'f' ['g' 'h']",e,b));
        truth.add(firstMatch("'i' ['j' 'k'] 'l'",e,b));
        truth.add(firstMatch("'m' ['n' 'o'] 'p'",e,b));
        guess.add(firstMatch("'e' ['f' 'g'] 'h'",e,b));
        guess.add(firstMatch("'i' ['j' 'k'] 'l'",e,b));
        guess.add(firstMatch("'m' 'n' ['o' 'p']",e,b));
        guess.add(firstMatch("'r' ['s' 't'] 'u'",e,b));
      }catch(Mixup.ParseException ex){
        // A malformed fixture pattern must abort the test rather than let
        // it continue with partially-built guess/truth sets.
        throw new IllegalStateException(ex.toString(),ex);
      }

      SpanDifference sd=new SpanDifference(guess.iterator(),truth.iterator());
      DiffExpects[] expects=new DiffExpects[]{
          new DiffExpects("b c",SpanDifference.FALSE_NEG),
          new DiffExpects("f",SpanDifference.FALSE_POS),
          new DiffExpects("g",SpanDifference.TRUE_POS),
          new DiffExpects("h",SpanDifference.FALSE_NEG),
          new DiffExpects("j k",SpanDifference.TRUE_POS),
          new DiffExpects("n",SpanDifference.FALSE_NEG),
          new DiffExpects("o",SpanDifference.TRUE_POS),
          new DiffExpects("p",SpanDifference.FALSE_POS),
          new DiffExpects("s t",SpanDifference.FALSE_POS)};

      SpanDifference.Looper d=sd.differenceIterator();
      int k=0;
      while(d.hasNext()){
        Span s=d.next();
        int stat=d.getStatus();
        // report surplus results as an assertion failure, not an
        // ArrayIndexOutOfBoundsException
        assertTrue("more differences reported than the "+expects.length+
            " expected",k<expects.length);
        DiffExpects dx=expects[k++];
        assertEquals(dx.s,s.asString());
        assertEquals(dx.stat,stat);
      }
      // without this check an iterator that stops early would pass silently
      assertEquals("number of differences reported",expects.length,k);
    }

    /**
     * Returns the first span extracted by <code>pattern</code> from the
     * document spans of <code>b</code>.
     *
     * @throws Mixup.ParseException if <code>pattern</code> cannot be parsed
     */
    private static Span firstMatch(String pattern,TextLabels e,BasicTextBase b)
        throws Mixup.ParseException{
      Iterator<Span> matches=new Mixup(pattern).extract(e,b.documentSpanIterator());
      return matches.next();
    }

    /** An expected (span text, difference status) pair. */
    public static class DiffExpects{

      public String s;

      int stat;

      public DiffExpects(String s,int stat){
        this.s=s;
        this.stat=stat;
      }
    }
  }

  //
  // trie-testing code
  //

  /**
   * Adds several token sequences to a <code>Trie</code> under string ids
   * and checks the position, length and associated ids of each match found
   * in four test documents.
   */
  public static class TrieTest extends TestCase{

    private BasicTextBase b=new BasicTextBase();

    private Trie trie=new Trie();

    public TrieTest(String string){
      super(string);
    }

    public void doTest(){
      Tokenizer tokenizer=b.getTokenizer();
      trie.addWords("wwc",tokenizer.splitIntoTokens("william cohen"));
      trie.addWords("wjc",tokenizer.splitIntoTokens("william clinton"));
      trie.addWords("pc",tokenizer.splitIntoTokens("paul cohen"));
      trie.addWords("j2p2",tokenizer.splitIntoTokens("pope john paul II"));
      // same words as "wwc": a match must report both ids
      trie.addWords("theMan",tokenizer.splitIntoTokens("william cohen"));

      b.loadDocument("t1","aint william cohen a great guy?");
      b.loadDocument("t2",
          "men of the year: william william cohen ; william clinton ; and - bill gates??");
      b.loadDocument("t3",
          "cohen & jensen was written by (a) william cohen (b) paul cohen (c) all of the above");
      b.loadDocument("t4","is the pope john paul II or not?");

      checkLookup("t1",new TrieExpects[]{
          new TrieExpects(new String[]{"wwc","theMan"},1,2)});
      checkLookup("t2",new TrieExpects[]{
          new TrieExpects(new String[]{"wwc","theMan"},6,2),
          new TrieExpects(new String[]{"wjc"},9,2)});
      checkLookup("t3",new TrieExpects[]{
          new TrieExpects(new String[]{"wwc","theMan"},9,2),
          new TrieExpects(new String[]{"pc"},14,2)});
      checkLookup("t4",new TrieExpects[]{
          new TrieExpects(new String[]{"j2p2"},2,4)});
    }

    /**
     * Looks up the whole of document <code>documentId</code> in the trie and
     * asserts that the matches agree, in order, with <code>expects</code>
     * on start index, length and set of associated ids.
     */
    private void checkLookup(String documentId,TrieExpects[] expects){
      Span span=b.documentSpan(documentId);
      List<Span> spanList=new ArrayList<Span>();
      List<List<String>> idList=new ArrayList<List<String>>();
      if(DEBUG){
        System.out.println("lookup in "+span);
      }
      for(Trie.ResultIterator i=trie.lookup(span);i.hasNext();){
        spanList.add(i.next());
        // ids belong to the span most recently returned by next()
        idList.add(i.getAssociatedIds());
        if(DEBUG){
          int last=spanList.size()-1;
          System.out.println("found "+spanList.get(last)+" ids: "+
              idList.get(last));
        }
      }

      assertEquals(expects.length,spanList.size());
      for(int i=0;i<expects.length;i++){
        Span s=spanList.get(i);
        assertEquals(expects[i].start,s.documentSpanStartIndex());
        assertEquals(expects[i].length,s.size());
        List<String> ids=idList.get(i);
        assertEquals(expects[i].ids.length,ids.size());
        for(int j=0;j<expects[i].ids.length;j++){
          assertTrue(ids.contains(expects[i].ids[j]));
        }
      }
    }

    /** An expected trie match: its ids, start index and length. */
    public static class TrieExpects{

      public String[] ids;

      int start,length;

      public TrieExpects(String[] ids,int start,int length){
        this.ids=ids;
        this.start=start;
        this.length=length;
      }
    }
  }

  //
  // mixup-testing code
  //

  /**
   * Exercises Mixup expressions, Mixup programs, and dictionary lookups
   * across several documents, comparing the extracted spans token by token
   * with the expected strings.
   */
  public static class MixupTest extends TestCase{

    private BasicTextBase b=new BasicTextBase();

    private TextLabels e=new BasicTextLabels(b);

    public MixupTest(String string){
      super(string);
    }

    public void doTest(){
      b.loadDocument("test1","aa bb ccc dd ee ff");

      //
      // test basic mixup commands
      //
      checkExpr(e,"[any+] any{2,3}",new String[]{"aa bb ccc","aa bb ccc dd"});
      checkExpr(e,"... [any 'ccc'?] re('[cdef]')*",new String[]{"bb","bb ccc",
          "ccc","dd","ee","ff"});
      checkExpr(e,"[any] ...",new String[]{"aa"});
      checkExpr(e,"... [any]",new String[]{"ff"});
      checkExpr(e,"any{1} [any] ...",new String[]{"bb"});
      checkExpr(e,"any{2,3} [any] ...",new String[]{"ccc","dd"});
      checkExpr(e,"any{,2} [any] ...",new String[]{"aa","bb","ccc"});
      checkExpr(e,"any{3,} [any] ...",new String[]{"dd","ee","ff"});
      checkExpr(e,"[any{2,3}] ...",new String[]{"aa bb","aa bb ccc"});
      checkExpr(e,"... 'bb' [any] ...",new String[]{"ccc"});
      checkExpr(e,"... !'bb' [!'bb'] ...",new String[]{"dd","ee","ff"});
      checkExpr(e,"... [re('...') any] ...",new String[]{"ccc dd"});
      checkExpr(e,"... [!re('^..$') any] ...",new String[]{"ccc dd"});
      checkExpr(e,"... [eq('ccc') any] ...",new String[]{"ccc dd"});
      checkExpr(e,"... [any any] ... && [re('[bcd]') any]",new String[]{
          "bb ccc","ccc dd","dd ee"});
      checkExpr(e,"... [re('[bc]')] ... || ... [re('[cd]')] ...",new String[]{
          "bb","ccc","dd"});
      checkExpr(e,
          "(... [re('[bc]')] ... || ... [re('[cd]')] ...) && [re('...')]",
          new String[]{"ccc"});
      checkExpr(e,"...[re('...')]... && ( [re('[bc]')] || [re('[cd]')] ) ",
          new String[]{"ccc"});
      checkExpr(e,"...[L re('^..$')+ R]...",new String[]{"aa bb","dd ee ff"});
      checkExpr(e,"...<re('[bc]'),re('...')>[any]...",new String[]{"dd"});

      //
      // test program - out is the output tested against the 'expected' strings
      //
      checkProg(new String[]{
          "defTokenProp trigram:t =: ... [re('^...$')] ... ",
          "defTokenProp bigram:t =: ... [re('^..$')] ... ",
          "defSpanType out =: ... [trigram:t any] ..."},
          new String[]{"ccc dd"});
      checkProg(new String[]{
          "defTokenProp trigram:t =: ... [re('^...$')*] ...",
          "defTokenProp bigram:t =: ... [re('^..$')*] ...",
          "defSpanType out =: ... [trigram:t any] ..."},
          new String[]{"ccc dd"});
      checkProg(new String[]{
          "defTokenProp trigram:t =: ... [re('^...$')] ... ",
          "defTokenProp bigram:t =: ... [re('^..$')] ... ",
          "defSpanType trispan =: ... [trigram:t] ...",
          "defSpanType bispan=: ... [bigram:t bigram:t] ...",
          "defSpanType out =: ... [ @bispan @trispan ] ..."},
          new String[]{"aa bb ccc"});
      checkProg(new String[]{
          "defTokenProp trigram:t =: ... [re('^...$')] ... ",
          "defTokenProp bigram:t =: ... [re('^..$')] ... ",
          "defSpanType trispan =: ... [trigram:t] ...",
          "defSpanType bispan=: ... [bigram:t bigram:t] ...",
          "defSpanType out =: ... [ @bispan? @trispan ] ..."},
          new String[]{"aa bb ccc","ccc"});
      checkProg(new String[]{"defSpanType out =~ re 'b+ (c+)', 1'"},
          new String[]{"ccc"});
      checkProg(new String[]{"defSpanType out =~ re 'b+ (c+\\s+)', 1'"},
          new String[]{"ccc"});
      checkProg(new String[]{"defSpanType pair =: ... [any any] ...",
          "defSpanType out =pair- ... ['ccc'] ... "},
          new String[]{"aa bb","dd ee","ee ff"});
      checkProg(new String[]{"defSpanType out =~ trie aa bb,bb ccc,bb ccc dd"},
          new String[]{"aa bb","bb ccc","bb ccc dd"});

      //
      // test dictionaries and multiple documents
      //
      MutableTextLabels numLabels=new BasicTextLabels(b);
      String[] nums=new String[]{"one","two","three","four","five"};
      SortedSet<String> numSet=new TreeSet<String>();
      for(String num:nums){
        numSet.add(num);
      }
      numLabels.defineDictionary("num",numSet);
      b.loadDocument("test2","one fish, two fish");
      b.loadDocument("test3","red fish, blue fish");
      b.loadDocument("test4","one, two, three strikes you're out");
      b.loadDocument("test5","Three phish");

      try{
        Mixup numExpr=new Mixup("... [a(num) <!a(num),re('[a-z]')>] ...");
        checkLooper(new String[]{"one fish","two fish","three strikes"},
            numExpr.extract(numLabels,b.documentSpanIterator()));

        new BoneheadStemmer().stem(b,numLabels);
        Mixup stemExpr=new Mixup(" ... [stem:a(num) stem:strik] ... ");
        checkLooper(new String[]{"three strikes"},
            stemExpr.extract(numLabels,b.documentSpanIterator()));

        Mixup aiExpr=new Mixup("[ai(num) any]");
        checkLooper(new String[]{"Three phish"},
            aiExpr.extract(numLabels,b.documentSpanIterator()));
      }catch(Mixup.ParseException ex){
        throw new IllegalStateException("parse error "+ex,ex);
      }
    }

    /**
     * Evaluates the Mixup program made of <code>statements</code> on fresh
     * labels over the text base and checks the spans of type "out" against
     * <code>expected</code>.
     */
    private void checkProg(String[] statements,String[] expected){
      try{
        MixupProgram program=new MixupProgram(statements);
        MutableTextLabels labels=new BasicTextLabels(b);
        if(DEBUG){
          System.out.println("checking program "+program);
        }
        MixupInterpreter interp=new MixupInterpreter(program);
        interp.eval(labels);
        checkLooper(expected,labels.instanceIterator("out"));
      }catch(Mixup.ParseException ex){
        throw new IllegalStateException("parse error"+ex,ex);
      }
    }

    /**
     * Extracts <code>pattern</code> from every document span of the text
     * base and checks the result against <code>expected</code>.
     */
    private void checkExpr(TextLabels e,String pattern,String[] expected){
      if(DEBUG){
        System.out.println("checking "+pattern);
      }
      try{
        Mixup mixup=new Mixup(pattern);
        checkLooper(expected,mixup.extract(e,b.documentSpanIterator()));
      }catch(Mixup.ParseException ex){
        throw new IllegalStateException("parse error"+ex,ex);
      }
    }

    /**
     * Drains <code>looper</code> and asserts that it yields exactly
     * <code>expected.length</code> spans whose tokens, in order, equal the
     * tokens obtained by tokenizing the corresponding expected string.
     */
    private void checkLooper(String[] expected,Iterator<Span> looper){
      List<Span> list=new ArrayList<Span>();
      while(looper.hasNext()){
        Span s=looper.next();
        if(DEBUG){
          System.out.println(" - result '"+s+"'");
        }
        list.add(s);
      }
      assertEquals(expected.length,list.size());

      for(int i=0;i<list.size();i++){
        String[] toks=b.getTokenizer().splitIntoTokens(expected[i]);
        Span span=list.get(i);
        assertEquals(toks.length,span.size());
        if(DEBUG){
          System.out.print("checking '"+span.toString()+"' vs expected '"+
              expected[i]+"'");
        }
        for(int j=0;j<span.size();j++){
          assertEquals(toks[j],span.getToken(j).getValue());
        }
        if(DEBUG){
          System.out.println("- passed.");
        }
      }
    }
  }

  /** Runs the whole suite with the JUnit text runner. */
  public static void main(String[] argv){
    junit.textui.TestRunner.run(suite());
  }
}