package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import junit.framework.TestCase;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

/**
 * Integration test that runs a {@code tokenize,ssplit,pos,lemma,ner} pipeline over a fixed
 * sentence and checks the {@code NormalizedNamedEntityTagAnnotation} value attached to each
 * token against a hard-coded expectation.
 *
 * <p>NOTE(review): judging by the class name, the normalized values are presumably produced by
 * QuantifiableEntityNormalizingAnnotator as part of the {@code ner} annotator; this file does
 * not reference that class directly, so confirm against the pipeline configuration.
 *
 * @author mcdm
 */
public class QuantifiableEntityNormalizingAnnotatorITest extends TestCase {

  /** Pipeline shared by all test methods; built on the first {@link #setUp()} call. */
  private static AnnotationPipeline pipeline;

  /**
   * Builds the shared pipeline if it has not been built yet.
   *
   * <p>JUnit invokes this before every test method. The pipeline is held in a static field, so
   * it is constructed only once and then reused.
   *
   * @throws Exception if pipeline construction fails
   */
  @Override
  public void setUp() throws Exception {
    synchronized (QuantifiableEntityNormalizingAnnotatorITest.class) {
      if (pipeline == null) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
        pipeline = new StanfordCoreNLP(props);
      }
    }
  }

  /**
   * Annotates {@link #text} and verifies that the tokens carrying a normalized named-entity
   * value are exactly those in {@link #answer_text}, in order, with the normalizations listed
   * in {@link #answer_time}.
   *
   * @throws Exception if annotation fails
   */
  public void testQuantifiableEntityNormalizingAnnotator() throws Exception {
    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    // Collect what the pipeline produced and compare whole sequences at the end. A surplus or
    // missing entity then shows up as a readable assertion failure (both lists are printed)
    // rather than an ArrayIndexOutOfBoundsException on the expectation arrays.
    List<String> normalizedTokenTexts = new ArrayList<String>();
    List<String> normalizedValues = new ArrayList<String>();

    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (CoreLabel token : tokens) {
        String normalization = token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);

        // Diagnostic dump of every token: text, NER tag, normalized NER value.
        System.out.println(token.get(CoreAnnotations.TextAnnotation.class) + ": "
            + token.get(CoreAnnotations.NamedEntityTagAnnotation.class) + ", "
            + normalization);

        // Only tokens that received a normalization take part in the comparison.
        if (normalization != null) {
          normalizedTokenTexts.add(token.get(CoreAnnotations.OriginalTextAnnotation.class));
          normalizedValues.add(normalization);
        }
      }
    }

    assertEquals(Arrays.asList(answer_text), normalizedTokenTexts);
    assertEquals(Arrays.asList(answer_time), normalizedValues);
  }

  /** Input sentence fed to the pipeline. */
  static final String text =
      "On January 3 1980, Ellinais used the 2nd century A.D. temple of Zeus in Athens to stage the first known ceremony of the kind since the late 4th century.";

  /* For the record: values without SUTime
  static final String[] answer = {
    "19800103","19800103","19800103", // same normalization for every token in the entity
    "2*****02","2*****02", // actually wrong! it should catch A.D. as well
    "1.0",
    "******04","******04","******04","******04", // TODO: was "4*****04". why?
  };
  */

  // With SUTime

  /** Original text of each token expected to carry a normalized value, in document order. */
  static final String[] answer_text = {
      "January", "3", "1980", // same normalization for every token in the entity
      "the", "2nd", "century", "A.D.",
      "first",
      "the", "late", "4th", "century"
  };

  /** Expected normalized value for the token at the same index in {@link #answer_text}. */
  static final String[] answer_time = {
      "1980-01-03", "1980-01-03", "1980-01-03", // same normalization for every token in the entity
      "01XX", "01XX", "01XX", "01XX",
      "1.0",
      //"P100Y-#4", "P100Y-#4", "P100Y-#4", "P100Y-#4",
      "03XX", "03XX", "03XX", "03XX"
  };
}