package edu.cmu.minorthird.text;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Test;
import org.apache.log4j.Logger;
/**
*
* @author Quinten Mercer
*/
public class TokenizerTest extends TestCase
{
Logger log = Logger.getLogger(this.getClass());
/** Sample documents to use for the test. */
public static String[] sampleDocs = new String[]{ "Mary had a little lamb. Its fleece was white as snow.",
"Eeny, meeny, miny, moe. Catch a tiger by the toe.",
"Row, row, row your boat gently down the stream. Merrily, merrily, merrily, life is but a dream.",
"Mary, Mary quite contrary, how does your garden grow?"};
/**
* Standard test class constructior for TextBaseTests
* @param name Name of the test
*/
public TokenizerTest(String name) { super(name); }
/** Convinence constructior for TextBaseTests */
public TokenizerTest() { super("TokenizerTest"); }
/** Set up steps to run before each test */
protected void setUp() {
Logger.getRootLogger().removeAllAppenders();
org.apache.log4j.BasicConfigurator.configure();
//TODO add initializations if needed
}
/** clean up steps to run after each test */
protected void tearDown() {
//TODO clean up resources if needed
}
//
// the Tests
//
public void testRegexTokenizer() {
System.out.println("Testing the RegexTokenizer");
//
// First test it with the default pattern
//
RegexTokenizer tokenizer = new RegexTokenizer();
// Test splitting each sample as a string
String[] tokens1 = tokenizer.splitIntoTokens(sampleDocs[0]);
assertEquals(13, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[1]);
assertEquals(15, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[2]);
assertEquals(24, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[3]);
assertEquals(12, tokens1.length);
// Then put each sample into a doc and test again
Document doc = new Document("doc0", sampleDocs[0]);
TextToken[] tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(13, tokens2.length);
doc = new Document("doc1", sampleDocs[1]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(15, tokens2.length);
doc = new Document("doc2", sampleDocs[2]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(24, tokens2.length);
doc = new Document("doc3", sampleDocs[3]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(12, tokens2.length);
//
// Then test it with a custom pattern
//
String newPattern = "([^\\s]+)";
tokenizer = new RegexTokenizer(newPattern);
// Test splitting each sample as a string
tokens1 = tokenizer.splitIntoTokens(sampleDocs[0]);
assertEquals(11, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[1]);
assertEquals(10, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[2]);
assertEquals(17, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[3]);
assertEquals(9, tokens1.length);
// Then put each sample into a doc and test again
doc = new Document("doc0", sampleDocs[0]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(11, tokens2.length);
doc = new Document("doc1", sampleDocs[1]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(10, tokens2.length);
doc = new Document("doc2", sampleDocs[2]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(17, tokens2.length);
doc = new Document("doc3", sampleDocs[3]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(9, tokens2.length);
}
public void testSpanTypeTokenizer() {
System.out.println("Testing the SpanTypeTokenizer");
// Create a text base to house the sample documents
BasicTextBase textBase = new BasicTextBase();
textBase.loadDocument("doc0", sampleDocs[0]);
textBase.loadDocument("doc1", sampleDocs[1]);
textBase.loadDocument("doc2", sampleDocs[2]);
textBase.loadDocument("doc3", sampleDocs[3]);
// Now create a labels set for this text base and add some annotations
BasicTextLabels labels = new BasicTextLabels(textBase);
labels.addToType(new BasicSpan("doc0", textBase.getDocument("doc0").getTokens(), 0, 6, "doc0"), "sentence");
labels.addToType(new BasicSpan("doc0", textBase.getDocument("doc0").getTokens(), 6, 7, "doc0"), "sentence");
labels.addToType(new BasicSpan("doc1", textBase.getDocument("doc1").getTokens(), 0, 8, "doc1"), "sentence");
labels.addToType(new BasicSpan("doc2", textBase.getDocument("doc2").getTokens(), 12, 12, "doc2"), "sentence");
labels.addToType(new BasicSpan("doc3", textBase.getDocument("doc3").getTokens(), 0, 12, "doc3"), "sentence");
// Create a SpanTypeTokenizer to make each sentence a token.
SpanTypeTokenizer spanTypeTokenizer = new SpanTypeTokenizer("sentence", labels);
// Tokenize the sample strings checking to make sure that it uses the base tokenizer
// since without being in the context of a Document, there is no way to reconcile
// this doc back to the parent labels set.
String[] tokens1 = spanTypeTokenizer.splitIntoTokens(sampleDocs[0]);
assertEquals(13, tokens1.length);
tokens1 = spanTypeTokenizer.splitIntoTokens(sampleDocs[1]);
assertEquals(15, tokens1.length);
tokens1 = spanTypeTokenizer.splitIntoTokens(sampleDocs[2]);
assertEquals(24, tokens1.length);
tokens1 = spanTypeTokenizer.splitIntoTokens(sampleDocs[3]);
assertEquals(12, tokens1.length);
// Now put the sample strings into new docs with the same doc ids (a requirement of the
// SpanTypeTokenizer class) and tokenize them with the new Tokenizer to make sure they
// are tokenized into one token per sentence.
Document newDoc0 = new Document("doc0", sampleDocs[0]);
Document newDoc1 = new Document("doc1", sampleDocs[1]);
Document newDoc2 = new Document("doc2", sampleDocs[2]);
Document newDoc3 = new Document("doc3", sampleDocs[3]);
TextToken[] tokens2 = spanTypeTokenizer.splitIntoTokens(newDoc0);
assertEquals(2, tokens2.length);
tokens2 = spanTypeTokenizer.splitIntoTokens(newDoc1);
assertEquals(8, tokens2.length);
tokens2 = spanTypeTokenizer.splitIntoTokens(newDoc2);
assertEquals(13, tokens2.length);
tokens2 = spanTypeTokenizer.splitIntoTokens(newDoc3);
assertEquals(1, tokens2.length);
}
public void testSplitTokenizer() {
System.out.println("Testing the SplitTokenizer");
// Split tokens up by the '.' char.
SplitTokenizer tokenizer = new SplitTokenizer("\\.");
// Test splitting each sample as a string
String[] tokens1 = tokenizer.splitIntoTokens(sampleDocs[0]);
assertEquals(2, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[1]);
assertEquals(2, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[2]);
assertEquals(2, tokens1.length);
tokens1 = tokenizer.splitIntoTokens(sampleDocs[3]);
assertEquals(1, tokens1.length);
// Then put each sample into a doc and test again
Document doc = new Document("doc0", sampleDocs[0]);
TextToken[] tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(2, tokens2.length);
doc = new Document("doc1", sampleDocs[1]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(2, tokens2.length);
doc = new Document("doc2", sampleDocs[2]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(2, tokens2.length);
doc = new Document("doc3", sampleDocs[3]);
tokens2 = tokenizer.splitIntoTokens(doc);
assertEquals(1, tokens2.length);
}
/**
* Creates a TestSuite from all testXXX methods
* @return TestSuite
*/
public static Test suite() { return new TestSuite(TokenizerTest.class); }
/**
* Run the full suite of tests with text output
* @param args - unused
*/
public static void main(String args[]) {
junit.textui.TestRunner.run(suite());
}
}