//MaxentTaggerITest -- StanfordMaxEnt, A Maximum Entropy Toolkit
//Copyright (c) 2002-2010 Leland Stanford Junior University
//This program is free software; you can redistribute it and/or
//modify it under the terms of the GNU General Public License
//as published by the Free Software Foundation; either version 2
//of the License, or (at your option) any later version.
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//For more information, bug reports, fixes, contact:
//Christopher Manning
//Dept of Computer Science, Gates 1A
//Stanford CA 94305-9010
//USA
//Support/Questions: java-nlp-user@lists.stanford.edu
//Licensing: java-nlp-support@lists.stanford.edu
//http://www-nlp.stanford.edu/software/tagger.shtml
// Author: John Bauer
// The purpose of this itest is to make sure that the standard tagger
// tags things in the expected manner.
// TODO: add more test cases
package edu.stanford.nlp.tagger.maxent;
import junit.framework.TestCase;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter.OutputStyle;
/**
 * Integration test for {@link MaxentTagger}.  A single tagger is built from
 * {@code MaxentTagger.DEFAULT_JAR_PATH} and shared by every test; the tests
 * then check tokenizer selection, sentence splitting, and the tags produced
 * through the tagger's different entry points against hard-coded expectations.
 */
public class MaxentTaggerITest extends TestCase {

  /** Tagger shared by all tests; created by the first call to {@link #setUp()}. */
  private static MaxentTagger tagger = null;

  /**
   * Builds the shared tagger if it does not exist yet.  The check-and-create
   * step runs under the class lock so the tagger is constructed at most once.
   *
   * @throws Exception if constructing the tagger fails
   */
  @Override
  public void setUp() throws Exception {
    synchronized (MaxentTaggerITest.class) {
      if (tagger == null) {
        tagger = new MaxentTagger(MaxentTagger.DEFAULT_JAR_PATH);
      }
    }
  }

  /**
   * Checks which factory type {@code MaxentTagger.chooseTokenizerFactory}
   * returns: a whitespace factory when the first argument is {@code false},
   * and a PTB factory when it is {@code true}, both with an empty factory
   * class name and with the PTB factory class named explicitly.
   *
   * @throws Exception if choosing a tokenizer factory fails
   */
  public void testChooseTokenizer() throws Exception {
    TokenizerFactory<? extends HasWord> tokenizer;

    tokenizer = MaxentTagger.chooseTokenizerFactory(false, "", "", false);
    assertTrue(tokenizer instanceof WhitespaceTokenizer.WhitespaceTokenizerFactory);

    tokenizer = MaxentTagger.chooseTokenizerFactory(true, "", "", false);
    assertTrue(tokenizer instanceof PTBTokenizer.PTBTokenizerFactory);

    tokenizer = MaxentTagger.chooseTokenizerFactory(true, "edu.stanford.nlp.process.PTBTokenizer$PTBTokenizerFactory", "", false);
    assertTrue(tokenizer instanceof PTBTokenizer.PTBTokenizerFactory);
  }

  /**
   * Checks that {@code MaxentTagger.tokenizeText} splits a three-sentence
   * string into exactly three token lists with the expected contents.
   */
  public void testTokenizeTest() {
    String text = "I think I'll go to Boston. I think I'm starting over. I think I'll start a new life where no one knows my name.";
    String[] expectedResults = {"[I, think, I, 'll, go, to, Boston, .]",
                                "[I, think, I, 'm, starting, over, .]",
                                "[I, think, I, 'll, start, a, new, life, where, no, one, knows, my, name, .]"};

    List<List<HasWord>> results = MaxentTagger.tokenizeText(new BufferedReader(new StringReader(text)));

    // Compare the sentence count first: without this an empty or truncated
    // result would pass vacuously, and an over-long one would die with an
    // ArrayIndexOutOfBoundsException instead of a readable assertion failure.
    assertEquals("number of sentences", expectedResults.length, results.size());
    for (int i = 0; i < expectedResults.length; ++i) {
      assertEquals("sentence " + i, expectedResults[i], results.get(i).toString());
    }
  }

  /**
   * Asserts that two sequences of lines have the same length and are equal
   * line by line, ignoring leading and trailing whitespace on each line.
   *
   * @param expectedOutput the expected lines
   * @param outputStrings the lines actually produced
   */
  private static void compareResults(String[] expectedOutput, List<String> outputStrings) {
    assertEquals("number of output lines", expectedOutput.length, outputStrings.size());
    for (int i = 0; i < outputStrings.size(); ++i) {
      assertEquals("output line " + i, expectedOutput[i].trim(), outputStrings.get(i).trim());
    }
  }

  /**
   * Feeds {@code input} to the shared tagger using {@code OutputStyle.SLASH_TAGS},
   * splits what it wrote into lines, and compares those lines with
   * {@code expectedOutput}.
   *
   * @param emulateStdin if {@code true}, call {@code runTaggerStdin} (which
   *        is not given {@code xmlTag}); otherwise call {@code runTagger}
   * @param xmlTag passed through to {@code runTagger}; unused when
   *        {@code emulateStdin} is {@code true}
   * @param input the text handed to the tagger
   * @param expectedOutput the expected output, one element per line
   * @throws RuntimeException wrapping any exception thrown while tagging or
   *         while reading the output back
   */
  private static void runRunTaggerTest(boolean emulateStdin, String xmlTag,
                                       String input,
                                       String ... expectedOutput) {
    StringWriter output = new StringWriter();
    try {
      if (emulateStdin) {
        tagger.runTaggerStdin(new BufferedReader(new StringReader(input)),
                              new BufferedWriter(output),
                              OutputStyle.SLASH_TAGS);
      } else {
        tagger.runTagger(new BufferedReader(new StringReader(input)),
                         new BufferedWriter(output), xmlTag,
                         OutputStyle.SLASH_TAGS);
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    // Re-read the captured text line by line so it can be compared with the
    // per-line expectations.
    BufferedReader reader = new BufferedReader(new StringReader(output.toString()));
    List<String> outputStrings = new ArrayList<String>();
    try {
      for (String outputLine; (outputLine = reader.readLine()) != null; ) {
        outputStrings.add(outputLine);
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    compareResults(expectedOutput, outputStrings);
  }

  /**
   * Test the stdin handling of runTagger
   */
  public void testRunTaggerStdin() {
    runRunTaggerTest(true, "",
                     "This is a test.\nThe cat fought the dog. The dog won because it was much bigger.",
                     "This_DT is_VBZ a_DT test_NN ._.",
                     "The_DT cat_NN fought_VBD the_DT dog_NN ._.",
                     "The_DT dog_NN won_VBD because_IN it_PRP was_VBD much_RB bigger_JJR ._.");
  }

  /**
   * Test the non-console (eg file input) text handling of runTagger
   */
  public void testRunTaggerNotStdin() {
    runRunTaggerTest(false, "",
                     "This is another test. This time, the input is not from the console.",
                     "This_DT is_VBZ another_DT test_NN ._.",
                     "This_DT time_NN ,_, the_DT input_NN is_VBZ not_RB from_IN the_DT console_NN ._.");
  }

  /**
   * Test the non-console xml handling of runTagger with a single tag name
   */
  public void testRunTaggerXML() {
    runRunTaggerTest(false, "text",
                     "<tagger>\n <text>\n This tests the xml input.\n </text> \n This should not be tagged. \n <text>\n This should be tagged.\n </text>\n <text>\n The dog's barking kept the\n neighbors up all night.\n </text>\n</tagging>",
                     "This_DT tests_VBZ the_DT xml_NN input_NN ._.",
                     "This_DT should_MD be_VB tagged_VBN ._.",
                     "The_DT dog_NN 's_POS barking_VBG kept_VBD the_DT neighbors_NNS up_IN all_DT night_NN ._.");
  }

  /**
   * Test the non-console xml handling of runTagger when the tag argument
   * is an alternation of two tag names
   */
  public void testRunTaggerXML2Tags() {
    runRunTaggerTest(false, "foo|bar",
                     "<tagger>\n <foo>\n This tests the xml input.\n </foo> \n This should not be tagged. \n <bar>\n This should be tagged.\n </bar>\n <foo>\n The dog's barking kept the\n neighbors up all night.\n </foo>\n</tagging>",
                     "This_DT tests_VBZ the_DT xml_NN input_NN ._.",
                     "This_DT should_MD be_VB tagged_VBN ._.",
                     "The_DT dog_NN 's_POS barking_VBG kept_VBD the_DT neighbors_NNS up_IN all_DT night_NN ._.");
  }

  /**
   * Test the non-console xml handling of runTagger when the tag argument
   * is a pattern ("text.*") covering several distinct tag names
   */
  public void testRunTaggerManyTags() {
    runRunTaggerTest(false, "text.*",
                     "<tagger>\n <text1>\n This tests the xml input.\n </text1> \n This should not be tagged. \n <text2>\n This should be tagged.\n </text2>\n <text3>\n The dog's barking kept the\n neighbors up all night.\n </text3>\n</tagging>",
                     "This_DT tests_VBZ the_DT xml_NN input_NN ._.",
                     "This_DT should_MD be_VB tagged_VBN ._.",
                     "The_DT dog_NN 's_POS barking_VBG kept_VBD the_DT neighbors_NNS up_IN all_DT night_NN ._.");
  }

  /**
   * Runs {@code tagFromXML} on {@code input} for the given tags and compares
   * the result with {@code expectedOutput}.  Every run of whitespace in both
   * strings is collapsed to a single space and the ends are trimmed before
   * comparing, so only the token sequence and markup matter.
   *
   * @param input the XML text handed to the tagger
   * @param expectedOutput the expected tagged XML
   * @param tags the tag names passed to {@code tagFromXML}
   */
  private static void runTagFromXMLTest(String input,
                                        String expectedOutput, String ... tags) {
    StringWriter outputWriter = new StringWriter();
    tagger.tagFromXML(new BufferedReader(new StringReader(input)),
                      new BufferedWriter(outputWriter), tags);
    String actualOutput = outputWriter.toString().replaceAll("\\s+", " ");
    expectedOutput = expectedOutput.replaceAll("\\s+", " ");
    assertEquals(expectedOutput.trim(), actualOutput.trim());
  }

  /** Tags the contents of a single element. */
  public void testTagFromXMLSimple() {
    String input = "<tagger><foo>This should be tagged</foo></tagger>";
    String output = "<tagger> <foo> This_DT should_MD be_VB tagged_VBN </foo> </tagger>";
    runTagFromXMLTest(input, output, "foo");
  }

  /** Tags two different elements and leaves the text between them untouched. */
  public void testTagFromXMLTwoTags() {
    String input = "<tagger><foo>This should be tagged</foo>This should not<bar>This should also be tagged</bar></tagger>";
    String output = "<tagger> <foo> This_DT should_MD be_VB tagged_VBN </foo> This should not<bar> This_DT should_MD also_RB be_VB tagged_VBN </bar> </tagger>";
    runTagFromXMLTest(input, output, "foo", "bar");
  }

  /** Handles one requested element nested in another; only the outer element appears in the expected output. */
  public void testTagFromXMLNested() {
    String input = "<tagger><foo><bar>This should be tagged</bar></foo></tagger>";
    String output = "<tagger> <foo> This_DT should_MD be_VB tagged_VBN </foo> </tagger>";
    runTagFromXMLTest(input, output, "foo", "bar");
  }

  /** Handles a self-closing element ({@code <bar/>}) among regular ones. */
  public void testTagFromXMLSingleTag() {
    String input = "<tagger><foo>I have no idea what this will output</foo><bar/>but this should not be tagged<bar>this should be tagged</bar></tagger>";
    String output = "<tagger> <foo> I_PRP have_VBP no_DT idea_NN what_WP this_DT will_MD output_NN </foo> <bar> </bar> but this should not be tagged<bar> this_DT should_MD be_VB tagged_VBN </bar> </tagger> ";
    runTagFromXMLTest(input, output, "foo", "bar");
  }

  /** Handles literal {@code <} and {@code >} characters inside and outside a tagged element. */
  public void testTagFromXMLEscaping() {
    String input = "<tagger><foo>A simple math formula is 5 < 6</foo> which is the same as 6 > 5</tagger>";
    // the JJR tag here is wrong, but that's a tagger training data issue.
    String output = "<tagger> <foo> A_DT simple_JJ math_NN formula_NN is_VBZ 5_CD <_JJR 6_CD </foo> which is the same as 6 > 5</tagger>";
    runTagFromXMLTest(input, output, "foo", "bar");
  }

  /** Tags a plain sentence with {@code tagString} and compares the trimmed result. */
  public void testTagString() {
    String input = "My dog is fluffy and white and has a fluffy tail.";
    String expectedOutput = "My_PRP$ dog_NN is_VBZ fluffy_JJ and_CC white_JJ and_CC has_VBZ a_DT fluffy_JJ tail_NN ._.";
    String output = tagger.tagString(input).trim();
    assertEquals(expectedOutput, output);
  }

  /**
   * Builds a sentence of {@link CoreLabel}s, runs {@code tagCoreLabels} on it,
   * and checks the tag stored on each label.
   */
  public void testTagCoreLabels() {
    String[] testWords = {"I", "think", "I", "'ll",
                          "go", "to", "Boston", "."};
    String[] expectedTags = {"PRP", "VBP", "PRP", "MD",
                             "VB", "TO", "NNP", "."};

    List<CoreLabel> words = new ArrayList<CoreLabel>();
    for (String word : testWords) {
      CoreLabel label = new CoreLabel(new Word(word));
      // Copy the label's value into its word field before tagging.
      label.setWord(label.value());
      words.add(label);
    }

    tagger.tagCoreLabels(words);

    assertEquals(expectedTags.length, words.size());
    for (int i = 0; i < expectedTags.length; ++i) {
      assertEquals("tag of word " + i, expectedTags[i], words.get(i).tag());
    }
  }

  /**
   * Applies a {@code MaxentTagger.TaggerWrapper} to a pre-tokenized,
   * two-sentence query and checks the trimmed slash-tagged result.
   */
  public void testTaggerWrapper() {
    // NOTE(review): this config copy is never passed to the wrapper created
    // below, which is built from the shared tagger directly — confirm whether
    // the "tokenize" override was meant to be used or can be deleted.
    TaggerConfig config = new TaggerConfig(tagger.config);
    config.setProperty("tokenize", "false");
    MaxentTagger.TaggerWrapper wrapper =
      new MaxentTagger.TaggerWrapper(tagger);

    String query = "This is a test . What is the result of two sentences ?";
    String expectedResult = "This_DT is_VBZ a_DT test_NN ._. " +
      "What_WP is_VBZ the_DT result_NN of_IN two_CD sentences_NNS ?_.";
    String result = wrapper.apply(query).trim();
    assertEquals(expectedResult, result);
  }
}