package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.util.PropertiesUtils;
import junit.framework.TestCase;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
/**
 * Unit tests for {@link CleanXmlAnnotator}: tag removal, tag filtering,
 * sentence-ending tags, tolerance of malformed XML, invertibility of the
 * tokenization, XML context tracking and character offsets.
 *
 * @author John Bauer
 */
public class CleanXmlAnnotatorTest extends TestCase {

  /** Text wrapped in three nested tags; shared by the invertibility and context tests. */
  private static final String MANY_TAGS_TEXT =
      " <xml> <foo> <bar>This sentence should " +
      " </bar>be invertible. </foo> </xml> ";

  // The annotators are built lazily in setUp() and shared by all tests.
  private static Annotator ptbInvertible;
  private static Annotator ptbNotInvertible;
  private static Annotator cleanXmlAllTags;
  private static Annotator cleanXmlSomeTags;
  private static Annotator cleanXmlEndSentences;
  private static Annotator cleanXmlWithFlaws;
  private static Annotator wtsSplitter;

  /**
   * Initialize the annotators at the start of the unit test.
   * If they've already been initialized, do nothing.
   */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    synchronized (CleanXmlAnnotatorTest.class) {
      if (ptbInvertible == null) {
        ptbInvertible =
            new TokenizerAnnotator(false, "en", "invertible,ptb3Escaping=true");
      }
      if (ptbNotInvertible == null) {
        ptbNotInvertible =
            new TokenizerAnnotator(false, "en",
                                   "invertible=false,ptb3Escaping=true");
      }
      if (cleanXmlAllTags == null) {
        cleanXmlAllTags = new CleanXmlAnnotator(".*", "", "", false);
      }
      if (cleanXmlSomeTags == null) {
        cleanXmlSomeTags = new CleanXmlAnnotator("p", "", "", false);
      }
      if (cleanXmlEndSentences == null) {
        cleanXmlEndSentences = new CleanXmlAnnotator(".*", "p", "", false);
      }
      if (cleanXmlWithFlaws == null) {
        cleanXmlWithFlaws = new CleanXmlAnnotator(".*", "", "", true);
      }
      if (wtsSplitter == null) {
        wtsSplitter = new WordsToSentencesAnnotator(false);
      }
    }
  }

  /**
   * Builds an {@link Annotation} for {@code text} and runs the given
   * annotators over it in order: tokenizer, XML remover, sentence splitter.
   *
   * @param text the raw text to annotate
   * @param tokenizer the tokenizer to run; must not be null
   * @param xmlRemover the XML cleaner to run, or null to skip this step
   * @param splitter the sentence splitter to run, or null to skip this step
   * @return the annotation after all requested annotators have run
   */
  public static Annotation annotate(String text,
                                    Annotator tokenizer, Annotator xmlRemover,
                                    Annotator splitter) {
    Annotation annotation = new Annotation(text);
    tokenizer.annotate(annotation);
    if (xmlRemover != null) {
      xmlRemover.annotate(annotation);
    }
    if (splitter != null) {
      splitter.annotate(annotation);
    }
    return annotation;
  }

  /**
   * Asserts that the tokens of {@code annotation} match the tokens obtained
   * by tokenizing each {@code gold} string with the invertible tokenizer and
   * concatenating the results. If the annotation carries sentences, their
   * count must equal the number of gold strings.
   *
   * @param annotation the annotation under test
   * @param gold the expected text, one string per expected sentence
   */
  private static void checkResult(Annotation annotation,
                                  String... gold) {
    List<CoreLabel> goldTokens = new ArrayList<>();
    for (String goldText : gold) {
      Annotation goldAnnotation = annotate(goldText, ptbInvertible, null, null);
      goldTokens.addAll(goldAnnotation.get(CoreAnnotations.TokensAnnotation.class));
    }

    List<CoreLabel> actualTokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    assertNotNull("Annotation has no tokens", actualTokens);

    if (goldTokens.size() != actualTokens.size()) {
      // Dump both token sequences so a count mismatch is easy to diagnose.
      printWords(actualTokens);
      printWords(goldTokens);
    }
    assertEquals("Token count mismatch (gold vs: actual)", goldTokens.size(), actualTokens.size());
    for (int i = 0; i < actualTokens.size(); ++i) {
      assertEquals(goldTokens.get(i).word(),
                   actualTokens.get(i).word());
    }

    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences != null) {
      assertEquals("Sentence count mismatch", gold.length, sentences.size());
    }
  }

  /** Prints the words of {@code tokens} on one line of stderr, each followed by a space. */
  private static void printWords(List<CoreLabel> tokens) {
    for (CoreLabel token : tokens) {
      System.err.print(token.word());
      System.err.print(' ');
    }
    System.err.println();
  }

  /**
   * Asserts that concatenating the before-text and original text of every
   * token, followed by the after-text of the last token, yields {@code gold}.
   *
   * @param annotation the annotation whose tokens are inspected
   * @param gold the expected reconstructed text
   */
  private static void checkInvert(Annotation annotation, String gold) {
    List<CoreLabel> tokens =
        annotation.get(CoreAnnotations.TokensAnnotation.class);
    assertNotNull("Annotation has no tokens", tokens);
    // Without at least one token there is no trailing after-text to read.
    assertFalse("Annotation has an empty token list", tokens.isEmpty());

    StringBuilder original = new StringBuilder();
    for (CoreLabel token : tokens) {
      original.append(token.get(CoreAnnotations.BeforeAnnotation.class));
      original.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
    }
    CoreLabel lastToken = tokens.get(tokens.size() - 1);
    original.append(lastToken.get(CoreAnnotations.AfterAnnotation.class));
    assertEquals(gold, original.toString());
  }

  /**
   * Asserts that the XML context recorded on {@code label} equals
   * {@code expectedContext}, element by element.
   *
   * @param label the token to inspect
   * @param expectedContext the expected context entries, in order
   */
  private static void checkContext(CoreLabel label, String... expectedContext) {
    List<String> xmlContext = label.get(CoreAnnotations.XmlContextAnnotation.class);
    assertNotNull("Token has no XML context", xmlContext);
    assertEquals(expectedContext.length, xmlContext.size());
    for (int i = 0; i < expectedContext.length; ++i) {
      assertEquals(expectedContext[i], xmlContext.get(i));
    }
  }

  /**
   * Checks the XML context of the first five tokens produced from
   * {@link #MANY_TAGS_TEXT}: tokens 0-2 are expected inside xml/foo/bar,
   * tokens 3-4 inside xml/foo.
   */
  private static void checkManyTagsContext(List<CoreLabel> tokens) {
    for (int i = 0; i < 3; ++i) {
      checkContext(tokens.get(i), "xml", "foo", "bar");
    }
    for (int i = 3; i < 5; ++i) {
      checkContext(tokens.get(i), "xml", "foo");
    }
  }

  /**
   * Checks that malformed XML is cleaned to {@code gold} by the annotator
   * that allows flaws, and is rejected with an
   * {@link IllegalArgumentException} by the annotator that does not.
   */
  private static void checkFlawedXml(String testString, String gold) {
    checkResult(annotate(testString, ptbInvertible,
                         cleanXmlWithFlaws, wtsSplitter),
                gold);
    try {
      checkResult(annotate(testString, ptbInvertible,
                           cleanXmlAllTags, wtsSplitter),
                  gold);
      // fail() is reported as a test failure and is not swallowed by the
      // catch clause below, unlike a thrown IllegalArgumentException.
      fail("it was supposed to barf");
    } catch (IllegalArgumentException expected) {
      // this is what was supposed to happen
    }
  }

  /** A single enclosing tag is stripped, leaving only the text. */
  public void testRemoveXML() {
    String testString = "<xml>This is a test string.</xml>";
    checkResult(annotate(testString, ptbInvertible,
                         cleanXmlAllTags, wtsSplitter),
                "This is a test string.");
  }

  /** Only the text inside the configured tag ({@code p}) is kept. */
  public void testExtractSpecificTag() {
    String testString = ("<p>This is a test string.</p>" +
                         "<foo>This should not be found</foo>");
    checkResult(annotate(testString, ptbInvertible,
                         cleanXmlSomeTags, wtsSplitter),
                "This is a test string.");
  }

  /** A sentence-ending tag splits the text into two sentences; otherwise it stays one. */
  public void testSentenceSplitting() {
    String testString = ("<p>This sentence is split</p>" +
                         "<foo>over two tags</foo>");
    checkResult(annotate(testString, ptbInvertible,
                         cleanXmlAllTags, wtsSplitter),
                "This sentence is split over two tags");
    checkResult(annotate(testString, ptbInvertible,
                         cleanXmlEndSentences, wtsSplitter),
                "This sentence is split", "over two tags");
  }

  /** Nested tags of the same name are handled, with and without sentence-ending tags. */
  public void testNestedTags() {
    String testString = "<p><p>This text is in a</p>nested tag</p>";
    checkResult(annotate(testString, ptbInvertible,
                         cleanXmlAllTags, wtsSplitter),
                "This text is in a nested tag");
    checkResult(annotate(testString, ptbInvertible,
                         cleanXmlEndSentences, wtsSplitter),
                "This text is in a", "nested tag");
  }

  /** Mismatched closing tags are tolerated only when flawed XML is allowed. */
  public void testMissingCloseTags() {
    checkFlawedXml("<text><p>This text <p>has closing tags wrong</text>",
                   "This text has closing tags wrong");
  }

  /** Text that ends with tags still open is tolerated only when flawed XML is allowed. */
  public void testEarlyEnd() {
    checkFlawedXml("<text>This text ends before all tags closed",
                   "This text ends before all tags closed");
  }

  /** The original text, tags included, can be rebuilt from the cleaned tokens. */
  public void testInvertible() {
    String testNoTags = "This sentence should be invertible.";
    String testTags =
        " <xml> This sentence should be invertible. </xml> ";

    Annotation annotation = annotate(testNoTags, ptbInvertible,
                                     cleanXmlAllTags, wtsSplitter);
    checkResult(annotation, testNoTags);
    checkInvert(annotation, testNoTags);

    annotation = annotate(testTags, ptbInvertible,
                          cleanXmlAllTags, wtsSplitter);
    checkResult(annotation, testNoTags);
    checkInvert(annotation, testTags);

    annotation = annotate(MANY_TAGS_TEXT, ptbInvertible,
                          cleanXmlAllTags, wtsSplitter);
    checkResult(annotation, testNoTags);
    checkInvert(annotation, MANY_TAGS_TEXT);
  }

  /** Each token records the stack of tags that enclosed it. */
  public void testContext() {
    Annotation annotation = annotate(MANY_TAGS_TEXT, ptbInvertible,
                                     cleanXmlAllTags, wtsSplitter);
    checkManyTagsContext(annotation.get(CoreAnnotations.TokensAnnotation.class));
  }

  /**
   * The first token keeps begin/end offsets 6 and 10, i.e. its position in
   * the input string including the two leading {@code <p>} tags.
   */
  public void testOffsets() {
    String testString = "<p><p>This text is in a</p>nested tag</p>";
    Annotation annotation = annotate(testString, ptbInvertible,
                                     cleanXmlAllTags, wtsSplitter);
    checkResult(annotation, "This text is in a nested tag");
    CoreLabel firstToken =
        annotation.get(CoreAnnotations.TokensAnnotation.class).get(0);
    assertEquals(6,
                 firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class).intValue());
    assertEquals(10,
                 firstToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class).intValue());
  }

  /** A tag carrying an attribute is stripped like any other tag. */
  public void testAttributes() {
    String testString = "<p a=\"b\">This text has an attribute</p>";
    Annotation annotation = annotate(testString, ptbInvertible,
                                     cleanXmlAllTags, wtsSplitter);
    checkResult(annotation, "This text has an attribute");
  }

  /** Same invertibility and context checks, run through a {@link StanfordCoreNLP} pipeline. */
  public void testViaCoreNlp() {
    Annotation anno = new Annotation(MANY_TAGS_TEXT);
    Properties props = PropertiesUtils.asProperties(
        "annotators", "tokenize, ssplit, cleanxml",
        "tokenizer.options", "invertible,ptb3Escaping=true",
        "cleanxml.xmltags", ".*",
        "cleanxml.sentenceendingtags", "p",
        "cleanxml.datetags", "",
        "cleanxml.allowflawedxml", "false"
    );
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(anno);
    checkInvert(anno, MANY_TAGS_TEXT);
    checkManyTagsContext(anno.get(CoreAnnotations.TokensAnnotation.class));
  }
}