package edu.cmu.minorthird.text; import junit.framework.TestCase; import junit.framework.TestSuite; import junit.framework.Test; import org.apache.log4j.Logger; /** * * @author Quinten Mercer */ public class TextBaseManagerTest extends TestCase { Logger log = Logger.getLogger(this.getClass()); /** Sample documents to use for the test. */ public static String[] sampleDocs = new String[]{ "Mary had a little lamb. Its fleece was white as snow.", "Eeny, meeny, miny, moe. Catch a tiger by the toe.", "Row, row, row your boat gently down the stream. Merrily, merrily, merrily, life is but a dream.", "Mary, Mary quite contrary, how does your garden grow?"}; /** * Standard test class constructior for TextBaseTests * @param name Name of the test */ public TextBaseManagerTest(String name) { super(name); } /** Convinence constructior for TextBaseTests */ public TextBaseManagerTest() { super("TextBaseManagerTest"); } /** Set up steps to run before each test */ protected void setUp() { Logger.getRootLogger().removeAllAppenders(); org.apache.log4j.BasicConfigurator.configure(); //TODO add initializations if needed } /** clean up steps to run after each test */ protected void tearDown() { //TODO clean up resources if needed } // // the Tests // public void testRetokenize() { // Load some sample docs into a textbase BasicTextBase parentTextBase = new BasicTextBase(); parentTextBase.loadDocument("doc0", sampleDocs[0]); parentTextBase.loadDocument("doc1", sampleDocs[1]); parentTextBase.loadDocument("doc2", sampleDocs[2]); parentTextBase.loadDocument("doc3", sampleDocs[3]); // Now create a labels set for this text base and add some annotations BasicTextLabels labels = new BasicTextLabels(parentTextBase); BasicSpan span1 = new BasicSpan("doc0", parentTextBase.getDocument("doc0").getTokens(), 1, 4, "doc0"); labels.addToType(span1, "predicate"); BasicSpan span2 = new BasicSpan("doc0", parentTextBase.getDocument("doc0").getTokens(), 8, 4, "doc0"); labels.addToType(span2, "predicate"); BasicSpan span3 = new BasicSpan("doc1", parentTextBase.getDocument("doc1").getTokens(), 11, 3, "doc1"); labels.addToType(span3, "predicate"); BasicSpan span4 = new BasicSpan("doc2", parentTextBase.getDocument("doc2").getTokens(), 8, 3, "doc2"); labels.addToType(span4, "predicate"); BasicSpan span5 = new BasicSpan("doc2", parentTextBase.getDocument("doc2").getTokens(), 19, 4, "doc2"); labels.addToType(span5, "predicate"); BasicSpan span6 = new BasicSpan("doc3", parentTextBase.getDocument("doc3").getTokens(), 6, 5, "doc3"); labels.addToType(span6, "predicate"); // Create a TextBaseManager to manage the different levels TextBaseManager tbman = new TextBaseManager("root", parentTextBase); // create a new tokenizer RegexTokenizer newTokenizer = new RegexTokenizer("([^\\s]+)"); // call retokenize with this new stuff. MutableTextBase newTextBase = tbman.retokenize(newTokenizer, "root", "newLevel"); // Check that the TextBaseManager stored the correct new textbase under the correct level name TextBase tb = tbman.getTextBase("newLevel"); assertEquals(newTextBase, tb); // Check that there are the correct number of documents in the new text base assertEquals(parentTextBase.size(), newTextBase.size()); // Check that the documents in the new text base have the correct number of tokens assertEquals(11, newTextBase.documentSpan("doc0").size()); assertEquals(10, newTextBase.documentSpan("doc1").size()); assertEquals(17, newTextBase.documentSpan("doc2").size()); assertEquals(9, newTextBase.documentSpan("doc3").size()); // check that the textbase has the appropriate tokenizer for new docs assertEquals(newTokenizer, newTextBase.getTokenizer()); // Test mapping from the root level to the new level. Since the tokenizer we used split // tokens based on whitespace, the mapped spans will NOT have the exact same characters // as the originals. If there were non-white space characters (such as punctuation) that // are next to the first token of the span (imeediately to the left) or the last token // of the span (immediately to the right), then the tokens that contain these chars will // be included in the mapped span. This is because the new tokenizer will treat all of // these chars as a single token and since Spans are based on tokens and not chars, these // extra chars get added. // Start by mapping actual Span instances Span mappedSpan = tbman.getMatchingSpan(span1, "root", "newLevel"); assertEquals("had a little lamb.", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span2, "root", "newLevel"); assertEquals("was white as snow.", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span3, "root", "newLevel"); assertEquals("by the toe.", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span4, "root", "newLevel"); assertEquals("down the stream.", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span5, "root", "newLevel"); assertEquals("is but a dream.", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span6, "root", "newLevel"); assertEquals("how does your garden grow?", mappedSpan.asString()); // Now map some random char offsets to make sure that they get mapped to corresponding spans correctly mappedSpan = tbman.getMatchingSpan("root", "doc0", 13, 20, "newLevel"); assertEquals("little lamb. Its fleece", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan("root", "doc1", 7, 20, "newLevel"); assertEquals("meeny, miny, moe. Catch", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan("root", "doc2", 26, 27, "newLevel"); assertEquals("gently down the stream. Merrily,", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan("root", "doc3", 20, 24, "newLevel"); assertEquals("contrary, how does your garden", mappedSpan.asString()); } public void testFilter() { // Create a text base to house the sample docs BasicTextBase parentTextBase = new BasicTextBase(); parentTextBase.loadDocument("doc0", sampleDocs[0]); parentTextBase.loadDocument("doc1", sampleDocs[1]); parentTextBase.loadDocument("doc2", sampleDocs[2]); parentTextBase.loadDocument("doc3", sampleDocs[3]); // Now create a labels set for this text base and add some annotations BasicTextLabels labels = new BasicTextLabels(parentTextBase); BasicSpan span1 = new BasicSpan("doc0", parentTextBase.getDocument("doc0").getTokens(), 1, 4, "doc0"); labels.addToType(span1, "predicate"); BasicSpan span2 = new BasicSpan("doc0", parentTextBase.getDocument("doc0").getTokens(), 8, 4, "doc0"); labels.addToType(span2, "predicate"); BasicSpan span3 = new BasicSpan("doc1", parentTextBase.getDocument("doc1").getTokens(), 11, 3, "doc1"); labels.addToType(span3, "predicate"); BasicSpan span4 = new BasicSpan("doc2", parentTextBase.getDocument("doc2").getTokens(), 8, 3, "doc2"); labels.addToType(span4, "predicate"); BasicSpan span5 = new BasicSpan("doc2", parentTextBase.getDocument("doc2").getTokens(), 19, 4, "doc2"); labels.addToType(span5, "predicate"); BasicSpan span6 = new BasicSpan("doc3", parentTextBase.getDocument("doc3").getTokens(), 6, 5, "doc3"); labels.addToType(span6, "predicate"); // Create a TextBaseManager to manage the different levels TextBaseManager tbman = new TextBaseManager("root", parentTextBase); // call filter with this new stuff. TextBase newTextBase = tbman.filter("root", labels, "newLevel", "predicate"); // Check that the TextBaseManager stored the correct new textbase under the correct level name TextBase tb = tbman.getTextBase("newLevel"); assertEquals(newTextBase, tb); // Check that there are the correct number of documents in the new text base. In this case // since we filtered on the "predicate" type there should be one doc in the new text base // for each instance on this span type in the original text base. assertEquals(6, newTextBase.size()); // Check that the documents in the new text base have the correct number of tokens assertEquals(4, newTextBase.documentSpan("childTB0-doc0").size()); assertEquals(4, newTextBase.documentSpan("childTB1-doc0").size()); assertEquals(3, newTextBase.documentSpan("childTB0-doc1").size()); assertEquals(3, newTextBase.documentSpan("childTB0-doc2").size()); assertEquals(4, newTextBase.documentSpan("childTB1-doc2").size()); assertEquals(5, newTextBase.documentSpan("childTB0-doc3").size()); // Test mapping from the root level to the new level. Since the tokenizer we used split // tokens was the same as the original text base, the mapped spans should have the EXACT // same characters as the originals. // Start by mapping actual Span instances Span mappedSpan = tbman.getMatchingSpan(span1, "root", "newLevel"); assertEquals("had a little lamb", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span2, "root", "newLevel"); assertEquals("was white as snow", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span3, "root", "newLevel"); assertEquals("by the toe", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span4, "root", "newLevel"); assertEquals("down the stream", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span5, "root", "newLevel"); assertEquals("is but a dream", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan(span6, "root", "newLevel"); assertEquals("how does your garden grow", mappedSpan.asString()); // Now map some random char offsets to make sure that they get mapped to corresponding spans correctly mappedSpan = tbman.getMatchingSpan("root", "doc0", 13, 8, "newLevel"); assertEquals("little lamb", mappedSpan.asString()); mappedSpan = tbman.getMatchingSpan("root", "doc3", 26, 18, "newLevel"); assertEquals("how does your garden", mappedSpan.asString()); // There should be no matching spans for these because the char sequences I've specified // were not part of an instance of the span type I filtered the text base on. mappedSpan = tbman.getMatchingSpan("root", "doc1", 7, 13, "newLevel"); assertNull(mappedSpan); mappedSpan = tbman.getMatchingSpan("root", "doc2", 26, 18, "newLevel"); assertNull(mappedSpan); } public void testMultiLevel() { // Load some sample docs into a textbase BasicTextBase parentTextBase = new BasicTextBase(); parentTextBase.loadDocument("doc0", sampleDocs[0]); parentTextBase.loadDocument("doc1", sampleDocs[1]); parentTextBase.loadDocument("doc2", sampleDocs[2]); parentTextBase.loadDocument("doc3", sampleDocs[3]); // Now create a labels set for this text base and add some annotations BasicTextLabels labels = new BasicTextLabels(parentTextBase); BasicSpan span1 = new BasicSpan("doc0", parentTextBase.getDocument("doc0").getTokens(), 1, 4, "doc0"); labels.addToType(span1, "predicate"); BasicSpan span2 = new BasicSpan("doc0", parentTextBase.getDocument("doc0").getTokens(), 8, 4, "doc0"); labels.addToType(span2, "predicate"); BasicSpan span3 = new BasicSpan("doc1", parentTextBase.getDocument("doc1").getTokens(), 11, 3, "doc1"); labels.addToType(span3, "predicate"); BasicSpan span4 = new BasicSpan("doc2", parentTextBase.getDocument("doc2").getTokens(), 8, 3, "doc2"); labels.addToType(span4, "predicate"); BasicSpan span5 = new BasicSpan("doc2", parentTextBase.getDocument("doc2").getTokens(), 19, 4, "doc2"); labels.addToType(span5, "predicate"); BasicSpan span6 = new BasicSpan("doc3", parentTextBase.getDocument("doc3").getTokens(), 6, 5, "doc3"); labels.addToType(span6, "predicate"); // Create a TextBaseManager to manage the different levels TextBaseManager tbman = new TextBaseManager("root", parentTextBase); // create a new tokenizer RegexTokenizer newTokenizer = new RegexTokenizer("([^\\s]+)"); // call retokenize with this new stuff. tbman.retokenize(newTokenizer, "root", "retok"); // call filter with this new stuff. TextBase filteredTextBase = tbman.filter("retok", labels, "filtered", "predicate"); // Check that there are the correct number of documents in the new text base. In this case // since we filtered on the "predicate" type there should be one doc in the new text base // for each instance on this span type in the original text base. assertEquals(6, filteredTextBase.size()); // Check that the documents in the new text base have the correct number of tokens assertEquals(4, filteredTextBase.documentSpan("childTB0-doc0").size()); assertEquals(4, filteredTextBase.documentSpan("childTB1-doc0").size()); assertEquals(3, filteredTextBase.documentSpan("childTB0-doc1").size()); assertEquals(3, filteredTextBase.documentSpan("childTB0-doc2").size()); assertEquals(4, filteredTextBase.documentSpan("childTB1-doc2").size()); assertEquals(5, filteredTextBase.documentSpan("childTB0-doc3").size()); // check that the docs have the appropriate tokenization assertEquals(4, filteredTextBase.getDocument("childTB0-doc0").getTokens().length); assertEquals(4, filteredTextBase.getDocument("childTB1-doc0").getTokens().length); assertEquals(3, filteredTextBase.getDocument("childTB0-doc1").getTokens().length); assertEquals(3, filteredTextBase.getDocument("childTB0-doc2").getTokens().length); assertEquals(4, filteredTextBase.getDocument("childTB1-doc2").getTokens().length); assertEquals(5, filteredTextBase.getDocument("childTB0-doc3").getTokens().length); } /** * Creates a TestSuite from all testXXX methods * @return TestSuite */ public static Test suite() { return new TestSuite(TextBaseManagerTest.class); } /** * Run the full suite of tests with text output * @param args - unused */ public static void main(String args[]) { junit.textui.TestRunner.run(suite()); } }