package edu.cmu.minorthird.text;
import java.io.File;
import java.util.Iterator;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import org.apache.log4j.Logger;
/**
*
* This class is responsible for...
*
* @author ksteppe
*/
public class TextBaseLoaderTest extends TestCase
{
Logger log = Logger.getLogger(this.getClass());
protected final String testCaseDir="test/edu/cmu/minorthird/text/testcases";
/**
* Standard test class constructior for TextBaseLoaderTest
* @param name Name of the test
*/
public TextBaseLoaderTest(String name) {
super(name);
}
/** Convinence constructior for TextBaseLoaderTest */
public TextBaseLoaderTest() {
super("TextBaseLoaderTest");
}
/** setUp to run before each test */
protected void setUp() {
Logger.getRootLogger().removeAllAppenders();
org.apache.log4j.BasicConfigurator.configure();
//TODO add initializations if needed
}
/** clean up to run after each test */
protected void tearDown() {
//TODO clean up resources if needed
}
//
// The Tests
//
/**
* Begin by testing the basic loading of a data file functionality. It tests both using the default
* tokenizer or specifying a custom tokenizer.
*/
public void testLoadDataFile() {
try {
//
// Try the basic test of just loading a file with good data (ie no blamk lines, etc) using the standard tokenizer
//
TextBaseLoader loader = new TextBaseLoader(TextBaseLoader.DOC_PER_LINE, TextBaseLoader.USE_XML);
TextBase base = loader.load(new File(testCaseDir+"/DocPerLineTestData.base"));
MutableTextLabels labels = loader.getLabels();
// Check that the proper number of documents were loaded from the file.
assertEquals(7, base.size());
// Check that all the docs were loaded correctly
Span docSpan = base.documentSpan("DocPerLineTestData.base@line:1");
assertNotNull(docSpan);
assertEquals(19, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData.base@line:2");
assertNotNull(docSpan);
assertEquals(12, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData.base@line:3");
assertNotNull(docSpan);
assertEquals(12, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData.base@line:4");
assertNotNull(docSpan);
assertEquals(19, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData.base@line:5");
assertNotNull(docSpan);
assertEquals(11, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData.base@line:6");
assertNotNull(docSpan);
assertEquals(17, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData.base@line:7");
assertNotNull(docSpan);
assertEquals(6, docSpan.size());
// Lastly make sure that all embedded types were loaded
this.checkType(labels, "stime", "DocPerLineTestData.base@line:1", "4:00", 1);
this.checkType(labels, "location", "DocPerLineTestData.base@line:1", "Adamson Wing, Baker Hall", 1);
this.checkType(labels, "speaker", "DocPerLineTestData.base@line:2", "George W. Cobb", 1);
this.checkType(labels, "title", "DocPerLineTestData.base@line:3", "Title: Three Ways to Gum up a Statistics Course", 1);
this.checkType(labels, "sentence", "DocPerLineTestData.base@line:4", "My talk will be in two parts", 1);
this.checkType(labels, "comment", "DocPerLineTestData.base@line:5", "comments and observations", 1);
this.checkType(labels, "country", "DocPerLineTestData.base@line:6", "US", 1);
//
// Next repeat these tests with a dataset that has blank lines in it to make sure these are skipped
//
loader = new TextBaseLoader(TextBaseLoader.DOC_PER_LINE, TextBaseLoader.USE_XML);
base = loader.load(new File(testCaseDir+"/DocPerLineTestData_WithBlanks.base"));
labels = loader.getLabels();
// Check that the proper number of documents were loaded from the file.
//WARNING: THIS IS A KNOWN BUG THAT BLANK LINES ARE INCLUDED. LEAVE THE TEST FAILING
// TO REMIND US TO FIX THE BUG!!!
assertEquals(7, base.size());
// Check that all the docs were loaded correctly
docSpan = base.documentSpan("DocPerLineTestData_WithBlanks.base@line:4");
assertNotNull(docSpan);
assertEquals(19, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData_WithBlanks.base@line:5");
assertNotNull(docSpan);
assertEquals(12, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData_WithBlanks.base@line:6");
assertNotNull(docSpan);
assertEquals(12, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData_WithBlanks.base@line:7");
assertNotNull(docSpan);
assertEquals(19, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData_WithBlanks.base@line:11");
assertNotNull(docSpan);
assertEquals(11, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData_WithBlanks.base@line:12");
assertNotNull(docSpan);
assertEquals(17, docSpan.size());
docSpan = base.documentSpan("DocPerLineTestData_WithBlanks.base@line:13");
assertNotNull(docSpan);
assertEquals(6, docSpan.size());
// Lastly make sure that all embedded types were loaded
this.checkType(labels, "stime", "DocPerLineTestData_WithBlanks.base@line:4", "4:00", 1);
this.checkType(labels, "location", "DocPerLineTestData_WithBlanks.base@line:4", "Adamson Wing, Baker Hall", 1);
this.checkType(labels, "speaker", "DocPerLineTestData_WithBlanks.base@line:5", "George W. Cobb", 1);
this.checkType(labels, "title", "DocPerLineTestData_WithBlanks.base@line:6", "Title: Three Ways to Gum up a Statistics Course", 1);
this.checkType(labels, "sentence", "DocPerLineTestData_WithBlanks.base@line:7", "My talk will be in two parts", 1);
this.checkType(labels, "comment", "DocPerLineTestData_WithBlanks.base@line:11", "comments and observations", 1);
this.checkType(labels, "country", "DocPerLineTestData_WithBlanks.base@line:12", "US", 1);
}
catch (Exception e) {
log.fatal(e.getMessage(), e);
fail("testLoadDataFile failed because an exception occurred: " + e.getMessage());
}
}
public void testLoadDirectoryOfFiles() {
try {
//First test loading a directory of files that have embedded labels
TextBaseLoader loader = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE, TextBaseLoader.USE_XML);
TextBase base = loader.load(new File(testCaseDir+"/SeminarAnnouncements"));
MutableTextLabels labels = loader.getLabels();
// Check that the proper number of documents were loaded from the directory.
assertEquals(15, base.size());
// Check that all the docs were loaded correctly
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2450_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2457_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2477_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2513_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2516_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2527_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2611_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2627_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2633_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2674_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2680_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2737_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2752_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2811_0"));
assertNotNull(base.documentSpan("cmu.andrew.official.cmu-news-2912_1"));
// Check the the correct number of instances of each label were loaded
assertEquals(3, this.getNumLabels(labels, "etime"));
assertEquals(15, this.getNumLabels(labels, "location"));
assertEquals(18, this.getNumLabels(labels, "paragraph"));
assertEquals(49, this.getNumLabels(labels, "sentence"));
assertEquals(12, this.getNumLabels(labels, "speaker"));
assertEquals(14, this.getNumLabels(labels, "stime"));
// Check a number of the embedded type to make sure they were loaded correctly
this.checkType(labels, "etime", "cmu.andrew.official.cmu-news-2611_0", "5:00 P.M.", 1);
this.checkType(labels, "etime", "cmu.andrew.official.cmu-news-2457_0", "1:00pm", 1);
this.checkType(labels, "etime", "cmu.andrew.official.cmu-news-2674_0", "1:30 p.m.", 1);
this.checkType(labels, "location", "cmu.andrew.official.cmu-news-2527_0", "Baker Hall 235A", 1);
this.checkType(labels, "speaker", "cmu.andrew.official.cmu-news-2611_0", "FARRO F. RADJY, PH.D.", 1);
this.checkType(labels, "stime", "cmu.andrew.official.cmu-news-2752_0", "12:00 pm", 1);
}
catch(Exception e) {
log.fatal(e.getMessage(), e);
fail("testLoadDirectoryOfFiles failed because an exception occurred: " + e.getMessage());
}
}
public void testLoadWordPerLineFile() {
try {
TextBaseLoader loader = new TextBaseLoader();
TextBase base = loader.loadWordPerLineFile(new File(testCaseDir+"/eng.base"));
MutableTextLabels labels = loader.getLabels();
// Check that the proper number of documents were loaded from the file.
assertEquals(216, base.size());
// Check a couple of the docs to make sure that the text was loaded correctly
String msg1 = "BOXING - JOHNSON WINS UNANIMOUS POIUNTS VERDICT . DUBLIN 1996-08-31 American Tom Johnson successfully defended his IBF featherweight title when he earned a unanimous points decision over Venezuela 's Ramon Guzman on Saturday . ";
String msg2 = "SOCCER - RESULT IN SPANISH FIRST DIVISION . MADRID 1996-08-31 Result of game played in the Spanish first division on Saturday : Deportivo Coruna 1 Real Madrid 1 ";
String msg3 = "SOCCER - ARMENIA AND PORTUGAL DRAW 0-0 IN WORLD CUP QUALIFIER . YEREVAN 1996-08-31 Armenia and Portugal drew 0-0 in a World Cup soccer European group 9 qualifier on Saturday . Attendance : 5,000 ";
String msg4 = "SOCCER - AUSTRIA DRAW 0-0 WITH SCOTLAND IN WORLD CUP QUALIFIER . VIENNA 1996-08-31 Austria and Scotland drew 0-0 in a World Cup soccer European group four qualifier on Saturday . Attendance : 29,500 ";
String msg5 = "BASKETBALL - INTERNATIONAL TOURNAMENT RESULT . BELGRADE 1996-08-30 Result in an international basketball tournament on Friday : Red Star ( Yugoslavia ) beat Dinamo ( Russia ) 92-90 ( halftime 47-47 ) ";
String msg6 = "RUGBY LEAGUE - WIGAN BEAT BRADFORD 42-36 IN SEMIFINAL . WIGAN , England 1996-08-31 Result of English rugby league premiership semifinal played on Saturday : Wigan 42 Bradford Bulls 36 ";
assertEquals(msg1, base.getDocument("eng.base-155").getText());
assertEquals(msg2, base.getDocument("eng.base-160").getText());
assertEquals(msg3, base.getDocument("eng.base-136").getText());
assertEquals(msg4, base.getDocument("eng.base-162").getText());
assertEquals(msg5, base.getDocument("eng.base-5").getText());
assertEquals(msg6, base.getDocument("eng.base-102").getText());
// Some of the fields in this format get translated to span types. Check that these
// get created (and in the correct amounts).
assertEquals(4, this.getNumLabels(labels, "B-MISC"));
assertEquals(2094, this.getNumLabels(labels, "I-LOC"));
assertEquals(1264, this.getNumLabels(labels, "I-MISC"));
assertEquals(2092, this.getNumLabels(labels, "I-ORG"));
assertEquals(3149, this.getNumLabels(labels, "I-PER"));
}
catch (Exception e) {
log.fatal(e.getMessage(), e);
fail("testLoadWordPerLineFile failed because an exception occurred: " + e.getMessage());
}
}
//
// Helper methods
//
// returns the number of times the given type appears in the doc
private int getNumLabels(TextLabels labels, String type) {
int i = 0;
for (Iterator<Span> l = labels.instanceIterator(type); l.hasNext(); ) {
log.debug(l.next().asString());
i++;
}
return i;
}
// Asserts that there is an instance of the specified type, that this instance has the specified
// value and that it appears (with that value) the specified number of times
private void checkType(TextLabels labels, String type, String doc, String value, int num) {
int i = 0;
for (Iterator<Span> l = labels.instanceIterator(type, doc); l.hasNext(); i++) {
Span s = l.next();
assertEquals(value, s.asString());
}
assertEquals(num, i);
}
/**
* Creates a TestSuite from all testXXX methods
* @return TestSuite
*/
public static Test suite()
{
return new TestSuite(TextBaseLoaderTest.class);
}
/**
* Run the full suite of tests with text output
* @param args - unused
*/
public static void main(String args[])
{
junit.textui.TestRunner.run(suite());
}
}