package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;
import junit.framework.TestCase;

import java.io.File;
import java.io.PrintWriter;
import java.util.List;
import java.util.Properties;

/**
 * Test cases for TokensRegexNERAnnotator (taken from RegexNERAnnotator).
 *
 * <p>Every test first runs a shared {@code tokenize, ssplit, pos, lemma, ner}
 * pipeline over the input text and then applies a TokensRegexNER annotator on
 * top, checking the per-token annotations that result.
 *
 * @author Angel Chang
 */
public class TokensRegexNERAnnotatorITest extends TestCase {

  /** Annotator name; also used as the prefix of every property set in this test. */
  private static final String REGEX_ANNOTATOR_NAME = "tokensregexner";

  /** Mapping file used by the "basic" tests ported from RegexNERAnnotatorITest. */
  private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";

  // Shared across all tests; initialized once in setUp().
  private static StanfordCoreNLP pipeline;
  private static Annotator caseless;
  private static Annotator cased;
  private static Annotator annotator;

  /**
   * Lazily builds the shared pipeline and the mapping-file based annotators.
   * The work is done only on the first call; later calls are no-ops.
   */
  @Override
  public void setUp() throws Exception {
    synchronized (TokensRegexNERAnnotatorITest.class) {
      if (pipeline == null) {
        // Hack so we don't load the pipeline fresh for every test
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        pipeline = new StanfordCoreNLP(props);

        // Basic caseless and cased tokens regex annotators
        caseless = new TokensRegexNERAnnotator(MAPPING, true);
        cased = new TokensRegexNERAnnotator(MAPPING);
        annotator = cased;
      }
    }
  }

  // Helper methods

  /**
   * Creates an annotator named {@link #REGEX_ANNOTATOR_NAME} configured from the given properties.
   *
   * @param props properties to configure the annotator with
   * @return the newly constructed annotator
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props) {
    return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
  }

  /**
   * Creates an annotator from in-memory pattern rows, starting from empty properties.
   *
   * @param patterns one row per pattern; the columns of a row are joined with tabs
   * @param ignoreCase value for the annotator's {@code ignorecase} property
   * @return the newly constructed annotator
   * @throws Exception if the temporary mapping file cannot be created or written
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(String[][] patterns, boolean ignoreCase)
      throws Exception {
    return getTokensRegexNerAnnotator(new Properties(), patterns, ignoreCase);
  }

  /**
   * Writes the pattern rows to a temporary tab-separated file, points the
   * {@code mapping} and {@code ignorecase} properties at it, and creates an
   * annotator from the resulting properties. Note that {@code props} is modified.
   *
   * @param props properties to extend; the mapping and ignorecase entries are overwritten
   * @param patterns one row per pattern; the columns of a row are joined with tabs
   * @param ignoreCase value for the annotator's {@code ignorecase} property
   * @return the newly constructed annotator
   * @throws Exception if the temporary mapping file cannot be created or written
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props, String[][] patterns,
                                                                       boolean ignoreCase) throws Exception {
    // The suffix must carry its own dot; createTempFile does not add one.
    File tempFile = File.createTempFile("tokensregexnertest.patterns", ".txt");
    tempFile.deleteOnExit();

    PrintWriter pw = IOUtils.getPrintWriter(tempFile.getAbsolutePath());
    try {
      for (String[] p : patterns) {
        pw.println(StringUtils.join(p, "\t"));
      }
    } finally {
      // Always release the file handle, even if writing a row fails.
      pw.close();
    }

    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping", tempFile.getAbsolutePath());
    props.setProperty(REGEX_ANNOTATOR_NAME + ".ignorecase", String.valueOf(ignoreCase));
    return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
  }

  /**
   * Wraps the text in an {@link Annotation} and runs the shared pipeline over it.
   *
   * @param text raw text to annotate
   * @return the annotated document
   */
  protected static Annotation createDocument(String text) {
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);
    return annotation;
  }

  /**
   * Helper method, checks that each token is tagged with the expected NER type.
   *
   * @param tokens tokens to inspect
   * @param tags expected NER tag for each token, in order; must have one entry per token
   */
  private static void checkNerTags(List<CoreLabel> tokens, String... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      assertEquals("Mismatch for token tag NER " + i + " " + tokens.get(i),
          tags[i], tokens.get(i).get(CoreAnnotations.NamedEntityTagAnnotation.class));
    }
  }

  /**
   * Helper method, checks that each token carries the expected value under the given key.
   *
   * @param tokens tokens to inspect
   * @param key string-valued annotation key to read from every token
   * @param tags expected value for each token, in order (entries may be {@code null});
   *             must have one entry per token
   */
  private static void checkTags(List<CoreLabel> tokens, Class<? extends CoreAnnotation<String>> key,
                                String... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      assertEquals("Mismatch for token tag " + key + " " + i + " " + tokens.get(i),
          tags[i], tokens.get(i).get(key));
    }
  }

  /**
   * Helper method, re-annotate each token with specified tag
   *
   * @param tokens tokens to modify
   * @param key string-valued annotation key to set on every token
   * @param tags new value for each token, in order; must have one entry per token
   */
  private static void reannotate(List<CoreLabel> tokens, Class<? extends CoreAnnotation<String>> key,
                                 String... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      tokens.get(i).set(key, tags[i]);
    }
  }

  // Tests for TokensRegex syntax
  public void testTokensRegexSyntax() throws Exception {
    String[][] regexes = new String[][]{
        new String[]{"( /University/ /of/ [ {ner:LOCATION} ] )", "SCHOOL"}
        // TODO: TokensRegex literal string patterns ignores ignoreCase settings
        //new String[]{"( University of [ {ner:LOCATION} ] )", "SCHOOL"}
    };
    Annotator annotatorCased = getTokensRegexNerAnnotator(regexes, false);

    String str = "University of Alaska is located in Alaska.";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    // The pattern needs a LOCATION third token, so the tags from the NER step are left alone.
    checkNerTags(tokens,
        "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O");

    // Force "Alaska" to LOCATION so that the pattern can match on the second pass.
    reannotate(tokens, CoreAnnotations.NamedEntityTagAnnotation.class,
        "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
    annotatorCased.annotate(document);

    checkNerTags(tokens,
        "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");

    // Try lowercase
    Annotator annotatorCaseless = getTokensRegexNerAnnotator(regexes, true);

    str = "university of alaska is located in alaska.";
    document = createDocument(str);
    tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    checkNerTags(tokens,
        "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");

    // The case-sensitive annotator must not match the lowercase text.
    annotatorCased.annotate(document);
    checkNerTags(tokens,
        "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");

    annotatorCaseless.annotate(document);
    checkNerTags(tokens,
        "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
  }

  // Tests for TokensRegex syntax with match group
  public void testTokensRegexMatchGroup() throws Exception {
    // NOTE(review): the trailing "1" presumably selects capture group 1 as the span to tag
    // (cf. the "group" column in the mapping headers used below) - confirm.
    String[][] regexes = new String[][]{
        new String[]{"( /the/? /movie/ (/[A-Z].*/+) )", "MOVIE", "", "0", "1"}
    };
    Annotator annotatorCased = getTokensRegexNerAnnotator(regexes, false);

    String str = "the movie Mud was very muddy";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkNerTags(tokens,
        "O", "O", "MOVIE", "O", "O", "O");
  }

  // Tests for TokensRegexNer annotator annotating other fields
  public void testTokensRegexNormalizedAnnotate() throws Exception {
    Properties props = new Properties();
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.header", "pattern,ner,normalized,overwrite,priority,group");

    String[][] regexes = new String[][]{
        new String[]{"blue", "COLOR", "B", "", "0"},
        new String[]{"red", "COLOR", "R", "", "0"},
        new String[]{"green", "COLOR", "G", "", "0"}
    };
    Annotator annotatorCased = getTokensRegexNerAnnotator(props, regexes, false);

    String str = "These are all colors: blue, red, and green.";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkTags(tokens, CoreAnnotations.TextAnnotation.class,
        "These", "are", "all", "colors", ":", "blue", ",", "red", ",", "and", "green", ".");
    checkTags(tokens, CoreAnnotations.NamedEntityTagAnnotation.class,
        "O", "O", "O", "O", "O", "COLOR", "O", "COLOR", "O", "O", "COLOR", "O");
    checkTags(tokens, CoreAnnotations.NormalizedNamedEntityTagAnnotation.class,
        null, null, null, null, null, "B", null, "R", null, null, "G", null);
  }

  /**
   * String-valued annotation key, referenced by its binary class name in
   * {@link #testTokensRegexCustomAnnotate()}.
   */
  public static class TestAnnotation implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  // Tests for TokensRegexNer annotator annotating other fields with custom key mapping
  public void testTokensRegexCustomAnnotate() throws Exception {
    Properties props = new Properties();
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.header", "pattern,test,overwrite,priority,group");
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.field.test",
        "edu.stanford.nlp.pipeline.TokensRegexNERAnnotatorITest$TestAnnotation");

    String[][] regexes = new String[][]{
        new String[]{"test", "TEST", "", "0"}
    };
    Annotator annotatorCased = getTokensRegexNerAnnotator(props, regexes, true);

    String str = "Marking all test as test";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkTags(tokens, CoreAnnotations.TextAnnotation.class,
        "Marking", "all", "test", "as", "test");
    checkTags(tokens, TestAnnotation.class,
        null, null, "TEST", null, "TEST");
  }

  // Basic tests from RegexNERAnnotatorITest
  public void testBasicMatching() throws Exception {
    String str = "President Barack Obama lives in Chicago , Illinois , " +
        "and is a practicing Christian .";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkNerTags(tokens,
        "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE",
        "O", "O", "O", "O", "O", "IDEOLOGY", "O");
  }

  /**
   * The ORGANIZATION on Ontario Bank should not be overwritten since Ontario (STATE_OR_PROVINCE)
   * does not span Ontario Bank. Nevertheless, by the special Chinese KBP 2016 hack, the LOCATION on Ontario Lake
   * should be overwritten. Native American Church will overwrite ORGANIZATION with
   * RELIGION.
   */
  public void testOverwrite() throws Exception {
    String str = "I like Ontario Bank and Ontario Lake , and I like the Native American Church , too .";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkNerTags(tokens,
        "O", "O", "ORGANIZATION", "ORGANIZATION", "O", "STATE_OR_PROVINCE", "LOCATION", "O",
        "O", "O", "O", "O", "RELIGION", "RELIGION", "RELIGION", "O", "O", "O");
  }

  /**
   * In the mapping file, Christianity is assigned a higher priority than Early Christianity,
   * and so Early should not be marked as RELIGION.
   */
  public void testPriority() throws Exception {
    String str = "Christianity is of higher regex priority than Early Christianity . ";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkNerTags(tokens,
        "RELIGION", "O", "O", "O", "O", "O", "O", "O", "RELIGION", "O");
  }

  /**
   * Test that if there are no annotations at all, the annotator
   * throws an exception. We are happy if we can catch an exception
   * and continue, and if we don't get any exceptions, we throw an
   * exception of our own.
   */
  public void testEmptyAnnotation() throws Exception {
    try {
      annotator.annotate(new Annotation(""));
    } catch (RuntimeException e) {
      // Expected: an un-tokenized annotation must be rejected.
      return;
    }
    fail("Never expected to get this far... the annotator should have thrown an exception by now");
  }
}