package edu.stanford.nlp.pipeline;

import java.util.*;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import junit.framework.TestCase;

/**
 * Tests for {@code TokenizerAnnotator} that do not require model files.
 * See TokenizerAnnotatorITest for some tests that require model files.
 * See PTBTokenizerTest, etc. for more detailed language-specific tests.
 *
 * @author Christopher Manning
 */
public class TokenizerAnnotatorTest extends TestCase {

  private static final String text = "She'll prove it ain't so.";

  private static final List<String> tokenWords = Arrays.asList(
          "She",
          "'ll",
          "prove",
          "it",
          "ai",
          "n't",
          "so",
          ".");

  /**
   * Asserts that the annotation's tokens exactly match {@code expected}, in order,
   * checking both the {@code CoreLabel.word()} accessor and the raw
   * {@code TextAnnotation} key (two routes to the same token text).
   *
   * @param ann annotation that has already been run through a tokenizer
   * @param expected the full expected token sequence
   */
  private static void assertTokens(Annotation ann, List<String> expected) {
    // Check via the word() convenience accessor.
    Iterator<String> it = expected.iterator();
    for (CoreLabel word : ann.get(CoreAnnotations.TokensAnnotation.class)) {
      // Fail cleanly (rather than NoSuchElementException) if the tokenizer
      // produced more tokens than expected.
      assertTrue("Too many tokens in new CoreLabel usage", it.hasNext());
      assertEquals("Bung token in new CoreLabel usage", it.next(), word.word());
    }
    assertFalse("Too few tokens in new CoreLabel usage", it.hasNext());

    // Check again via the TextAnnotation key directly; both must agree.
    Iterator<String> it2 = expected.iterator();
    for (CoreLabel word : ann.get(CoreAnnotations.TokensAnnotation.class)) {
      assertTrue("Too many tokens in new CoreLabel usage", it2.hasNext());
      assertEquals("Bung token in new CoreLabel usage",
              it2.next(), word.get(CoreAnnotations.TextAnnotation.class));
    }
    assertFalse("Too few tokens in new CoreLabel usage", it2.hasNext());
  }

  /** Tokenizing directly with a TokenizerAnnotator built from a language code. */
  public void testNewVersion() {
    Annotation ann = new Annotation(text);
    Annotator annotator = new TokenizerAnnotator("en");
    annotator.annotate(ann);
    assertTokens(ann, tokenWords);
  }

  /** An unknown tokenize.language must be rejected at pipeline construction time. */
  public void testBadLanguage() {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize");
    props.setProperty("tokenize.language", "notalanguage");
    try {
      new StanfordCoreNLP(props);
      fail("Should have failed");  // fail() throws an Error, so it is not caught below
    } catch (IllegalArgumentException e) {
      // yay, passed
    }
  }

  /** By default the tokenizer should discard newlines rather than emit NL tokens. */
  public void testDefaultNoNLsPipeline() {
    String t = "Text with \n\n a new \nline.";
    List<String> tWords = Arrays.asList(
            "Text",
            "with",
            "a",
            "new",
            "line",
            ".");

    Properties props = new Properties();
    props.setProperty("annotators", "tokenize");
    Annotation ann = new Annotation(t);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(ann);
    assertTokens(ann, tWords);
  }

  /** Hyphenated words stay whole by default; splitHyphenated=true splits them. */
  public void testHyphens() {
    String test = "Hyphen-ated words should be split except when school-aged-children eat " +
        "anti-disestablishmentariansm for breakfast at the o-kay choral infront of some explor-o-toriums.";

    // Default options: hyphenated words are single tokens.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize");
    Annotation ann = new Annotation(test);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(ann);
    List<CoreLabel> toks = ann.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals(21, toks.size());

    // With splitHyphenated=true, the hyphens become separate tokens.
    Properties props2 = new Properties();
    props2.setProperty("annotators", "tokenize");
    props2.setProperty("tokenize.options", "splitHyphenated=true");
    Annotation ann2 = new Annotation(test);
    StanfordCoreNLP pipeline2 = new StanfordCoreNLP(props2);
    pipeline2.annotate(ann2);
    List<CoreLabel> toks2 = ann2.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals(27, toks2.size());
  }

}