// TextBaseManagerTest.java example
//
// Explorer
// MinorThird-master
package edu.cmu.minorthird.text;

import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Test;
import org.apache.log4j.Logger;

/**
 * Tests for {@link TextBaseManager}: retokenizing a text base, filtering it by span type,
 * and chaining both operations across levels, using four short sample documents.
 *
 * @author Quinten Mercer
 */
public class TextBaseManagerTest extends TestCase
{
    /** Logger for this test class; none of the tests below currently write to it. */
    Logger log = Logger.getLogger(this.getClass());

    /** Sample documents to use for the test. */
    public static String[] sampleDocs = new String[]{ "Mary had a little lamb.  Its fleece was white as snow.",
                                                      "Eeny, meeny, miny, moe.  Catch a tiger by the toe.",
                                                      "Row, row, row your boat gently down the stream.  Merrily, merrily, merrily, life is but a dream.",
                                                      "Mary, Mary quite contrary, how does your garden grow?"};

    /** Level name under which the original (parent) text base is registered with the manager. */
    private static final String ROOT_LEVEL = "root";

    /** Span type attached to every annotation created by the shared fixture. */
    private static final String PREDICATE_TYPE = "predicate";

    /** Tokenizer pattern that treats each run of non-whitespace characters as one token. */
    private static final String NON_WHITESPACE_TOKEN_REGEX = "([^\\s]+)";

    /**
     * Standard test class constructor for TextBaseManagerTest
     * @param name Name of the test
     */
    public TextBaseManagerTest(String name) { super(name); }

    /** Convenience constructor for TextBaseManagerTest */
    public TextBaseManagerTest() { super("TextBaseManagerTest"); }

    /** Set up steps to run before each test: resets log4j to its basic configuration. */
    protected void setUp() {
        // Drop any appenders left over from a previous test so output is not duplicated.
        Logger.getRootLogger().removeAllAppenders();
        org.apache.log4j.BasicConfigurator.configure();
        //TODO add initializations if needed
    }

    /** clean up steps to run after each test */
    protected void tearDown() {
        //TODO clean up resources if needed
    }


    //
    // shared fixture helpers
    //

    /**
     * Builds a fresh text base holding every entry of {@link #sampleDocs}.
     * Documents are loaded in array order under the ids "doc0", "doc1", ...
     *
     * @return the newly populated text base
     */
    private static BasicTextBase loadSampleTextBase() {
        BasicTextBase textBase = new BasicTextBase();
        for (int i = 0; i < sampleDocs.length; i++) {
            textBase.loadDocument("doc" + i, sampleDocs[i]);
        }
        return textBase;
    }

    /**
     * Creates one span over a document of the given text base and labels it with
     * {@link #PREDICATE_TYPE}.
     *
     * @param textBase   text base that owns the document
     * @param labels     labels set that receives the new annotation
     * @param docId      id of the document; also used as the span's document group id
     * @param firstToken index of the first token covered by the span
     * @param length     number of tokens covered by the span
     * @return the span that was created and labeled
     */
    private static BasicSpan addPredicate(BasicTextBase textBase, BasicTextLabels labels,
                                          String docId, int firstToken, int length) {
        BasicSpan span = new BasicSpan(docId, textBase.getDocument(docId).getTokens(), firstToken, length, docId);
        labels.addToType(span, PREDICATE_TYPE);
        return span;
    }

    /**
     * Adds the six "predicate" annotations shared by all tests.  Each span is created and
     * labeled before the next one is built (array initializers evaluate left to right).
     * The trailing comments give the text each span is expected to cover, as asserted in
     * {@link #testFilter()}.
     *
     * @param textBase text base populated by {@link #loadSampleTextBase()}
     * @param labels   labels set for that text base
     * @return the six spans, in creation order
     */
    private static BasicSpan[] addPredicateSpans(BasicTextBase textBase, BasicTextLabels labels) {
        return new BasicSpan[] {
            addPredicate(textBase, labels, "doc0", 1, 4),   // had a little lamb
            addPredicate(textBase, labels, "doc0", 8, 4),   // was white as snow
            addPredicate(textBase, labels, "doc1", 11, 3),  // by the toe
            addPredicate(textBase, labels, "doc2", 8, 3),   // down the stream
            addPredicate(textBase, labels, "doc2", 19, 4),  // is but a dream
            addPredicate(textBase, labels, "doc3", 6, 5)    // how does your garden grow
        };
    }

    /**
     * Maps each span from the root level to the given level and checks the mapped text.
     *
     * @param tbman     manager that knows both levels
     * @param spans     spans defined on the root level
     * @param toLevel   name of the level to map into
     * @param expected  expected text of each mapped span, parallel to {@code spans}
     */
    private static void assertMappedSpanTexts(TextBaseManager tbman, BasicSpan[] spans,
                                              String toLevel, String[] expected) {
        for (int i = 0; i < spans.length; i++) {
            Span mappedSpan = tbman.getMatchingSpan(spans[i], ROOT_LEVEL, toLevel);
            assertEquals(expected[i], mappedSpan.asString());
        }
    }


    //
    // the Tests
    //

    /**
     * Retokenizes the root text base on whitespace and checks the new level's registration,
     * document count, token counts, tokenizer, and span/char-offset mapping from the root.
     */
    public void testRetokenize() {

        // Load the sample docs into a textbase and annotate them
        BasicTextBase parentTextBase = loadSampleTextBase();
        BasicTextLabels labels = new BasicTextLabels(parentTextBase);
        BasicSpan[] spans = addPredicateSpans(parentTextBase, labels);

        // Create a TextBaseManager to manage the different levels
        TextBaseManager tbman = new TextBaseManager(ROOT_LEVEL, parentTextBase);

        // create a new tokenizer
        RegexTokenizer newTokenizer = new RegexTokenizer(NON_WHITESPACE_TOKEN_REGEX);

        // retokenize the root level into a new level using the new tokenizer
        MutableTextBase newTextBase = tbman.retokenize(newTokenizer, ROOT_LEVEL, "newLevel");

        // Check that the TextBaseManager stored the correct new textbase under the correct level name
        TextBase tb = tbman.getTextBase("newLevel");
        assertEquals(newTextBase, tb);

        // Check that there are the correct number of documents in the new text base
        assertEquals(parentTextBase.size(), newTextBase.size());

        // Check that the documents in the new text base have the correct number of tokens
        assertEquals(11, newTextBase.documentSpan("doc0").size());
        assertEquals(10, newTextBase.documentSpan("doc1").size());
        assertEquals(17, newTextBase.documentSpan("doc2").size());
        assertEquals(9, newTextBase.documentSpan("doc3").size());

        // check that the textbase has the appropriate tokenizer for new docs
        assertEquals(newTokenizer, newTextBase.getTokenizer());

        // Test mapping from the root level to the new level.  Since the tokenizer we used splits
        //   tokens based on whitespace, the mapped spans will NOT have the exact same characters
        //   as the originals.  If there are non-whitespace characters (such as punctuation)
        //   immediately to the left of the span's first token or immediately to the right of its
        //   last token, then the tokens that contain these chars will be included in the mapped
        //   span.  This is because the new tokenizer treats all of these chars as a single token,
        //   and since Spans are based on tokens and not chars, these extra chars get added.

        // Start by mapping actual Span instances
        assertMappedSpanTexts(tbman, spans, "newLevel", new String[] {
            "had a little lamb.",
            "was white as snow.",
            "by the toe.",
            "down the stream.",
            "is but a dream.",
            "how does your garden grow?"
        });

        // Now map some random char offsets to make sure that they get mapped to corresponding spans correctly
        Span mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc0", 13, 20, "newLevel");
        assertEquals("little lamb.  Its fleece", mappedSpan.asString());
        mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc1", 7, 20, "newLevel");
        assertEquals("meeny, miny, moe.  Catch", mappedSpan.asString());
        mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc2", 26, 27, "newLevel");
        assertEquals("gently down the stream.  Merrily,", mappedSpan.asString());
        mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc3", 20, 24, "newLevel");
        assertEquals("contrary, how does your garden", mappedSpan.asString());

    }

    /**
     * Filters the root text base on the "predicate" span type and checks the new level's
     * registration, document count, token counts, and span/char-offset mapping from the root.
     */
    public void testFilter() {

        // Load the sample docs into a textbase and annotate them
        BasicTextBase parentTextBase = loadSampleTextBase();
        BasicTextLabels labels = new BasicTextLabels(parentTextBase);
        BasicSpan[] spans = addPredicateSpans(parentTextBase, labels);

        // Create a TextBaseManager to manage the different levels
        TextBaseManager tbman = new TextBaseManager(ROOT_LEVEL, parentTextBase);

        // filter the root level on the "predicate" type into a new level
        TextBase newTextBase = tbman.filter(ROOT_LEVEL, labels, "newLevel", PREDICATE_TYPE);

        // Check that the TextBaseManager stored the correct new textbase under the correct level name
        TextBase tb = tbman.getTextBase("newLevel");
        assertEquals(newTextBase, tb);

        // Check that there are the correct number of documents in the new text base.  In this case
        //   since we filtered on the "predicate" type there should be one doc in the new text base
        //   for each instance of this span type in the original text base.
        assertEquals(6, newTextBase.size());

        // Check that the documents in the new text base have the correct number of tokens
        assertEquals(4, newTextBase.documentSpan("childTB0-doc0").size());
        assertEquals(4, newTextBase.documentSpan("childTB1-doc0").size());
        assertEquals(3, newTextBase.documentSpan("childTB0-doc1").size());
        assertEquals(3, newTextBase.documentSpan("childTB0-doc2").size());
        assertEquals(4, newTextBase.documentSpan("childTB1-doc2").size());
        assertEquals(5, newTextBase.documentSpan("childTB0-doc3").size());

        // Test mapping from the root level to the new level.  No new tokenizer is supplied to
        //   filter, so the mapped spans should have the EXACT same characters as the originals.

        // Start by mapping actual Span instances
        assertMappedSpanTexts(tbman, spans, "newLevel", new String[] {
            "had a little lamb",
            "was white as snow",
            "by the toe",
            "down the stream",
            "is but a dream",
            "how does your garden grow"
        });

        // Now map some random char offsets to make sure that they get mapped to corresponding spans correctly
        Span mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc0", 13, 8, "newLevel");
        assertEquals("little lamb", mappedSpan.asString());
        mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc3", 26, 18, "newLevel");
        assertEquals("how does your garden", mappedSpan.asString());

        // There should be no matching spans for these because the char sequences specified
        // were not part of an instance of the span type the text base was filtered on.
        mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc1", 7, 13, "newLevel");
        assertNull(mappedSpan);
        mappedSpan = tbman.getMatchingSpan(ROOT_LEVEL, "doc2", 26, 18, "newLevel");
        assertNull(mappedSpan);
    }

    /**
     * Chains the two operations: retokenizes the root level on whitespace, then filters the
     * retokenized level on the "predicate" type, and checks the resulting documents.
     */
    public void testMultiLevel() {
        // Load the sample docs into a textbase and annotate them (the spans themselves are
        // not needed here, only the labels they were added to)
        BasicTextBase parentTextBase = loadSampleTextBase();
        BasicTextLabels labels = new BasicTextLabels(parentTextBase);
        addPredicateSpans(parentTextBase, labels);

        // Create a TextBaseManager to manage the different levels
        TextBaseManager tbman = new TextBaseManager(ROOT_LEVEL, parentTextBase);

        // create a new tokenizer
        RegexTokenizer newTokenizer = new RegexTokenizer(NON_WHITESPACE_TOKEN_REGEX);
        // retokenize the root level into the "retok" level
        tbman.retokenize(newTokenizer, ROOT_LEVEL, "retok");
        // filter the "retok" level on the "predicate" type into the "filtered" level
        TextBase filteredTextBase = tbman.filter("retok", labels, "filtered", PREDICATE_TYPE);


        // Check that there are the correct number of documents in the new text base.  In this case
        //   since we filtered on the "predicate" type there should be one doc in the new text base
        //   for each instance of this span type in the original text base.
        assertEquals(6, filteredTextBase.size());

        // Check that the documents in the new text base have the correct number of tokens
        assertEquals(4, filteredTextBase.documentSpan("childTB0-doc0").size());
        assertEquals(4, filteredTextBase.documentSpan("childTB1-doc0").size());
        assertEquals(3, filteredTextBase.documentSpan("childTB0-doc1").size());
        assertEquals(3, filteredTextBase.documentSpan("childTB0-doc2").size());
        assertEquals(4, filteredTextBase.documentSpan("childTB1-doc2").size());
        assertEquals(5, filteredTextBase.documentSpan("childTB0-doc3").size());

        // check that the docs have the appropriate tokenization
        assertEquals(4, filteredTextBase.getDocument("childTB0-doc0").getTokens().length);
        assertEquals(4, filteredTextBase.getDocument("childTB1-doc0").getTokens().length);
        assertEquals(3, filteredTextBase.getDocument("childTB0-doc1").getTokens().length);
        assertEquals(3, filteredTextBase.getDocument("childTB0-doc2").getTokens().length);
        assertEquals(4, filteredTextBase.getDocument("childTB1-doc2").getTokens().length);
        assertEquals(5, filteredTextBase.getDocument("childTB0-doc3").getTokens().length);

    }


    /**
     * Creates a TestSuite from all testXXX methods
     * @return TestSuite
     */
    public static Test suite() { return new TestSuite(TextBaseManagerTest.class); }

    /**
     * Run the full suite of tests with text output
     * @param args - unused
     */
    public static void main(String[] args) {
        junit.textui.TestRunner.run(suite());
    }
}