package edu.stanford.nlp.pipeline; import java.util.*; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import junit.framework.TestCase; /** * Tests a couple tokenizer options, such as working with Spanish. * See TokenizerAnnotatorTest for more tests. * * @author John Bauer */ public class TokenizerAnnotatorITest extends TestCase { public void testNotSpanish() { Annotation ann = new Annotation("Damelo"); Properties props = new Properties(); props.setProperty("annotators", "tokenize"); props.setProperty("tokenize.language", "english"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(ann); assertEquals(1, ann.get(CoreAnnotations.TokensAnnotation.class).size()); assertEquals("Damelo", ann.get(CoreAnnotations.TokensAnnotation.class).get(0).word()); } private static final String spanishText = "Me voy a Madrid (ES).\n\"Me gusta\", lo dice."; private static List<String> spanishTokens = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "=LRB=", "ES", "=RRB=", ".", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." }); private static final String spanishText2 = "Me voy a Madrid (ES).\n(Me gusta), lo dice."; private static List<String> spanishTokens2 = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "=LRB=", "ES", "=RRB=", ".", "*NL*", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." }); public void testSpanishTokenizer() { TokenizerAnnotator annotator = new TokenizerAnnotator(false, "es", null); Annotation annotation = new Annotation(spanishText); annotator.annotate(annotation); List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); assertEquals(spanishTokens.size(), tokens.size()); for (int i = 0; i < tokens.size(); ++i) { assertEquals(spanishTokens.get(i), tokens.get(i).value()); } annotator = new TokenizerAnnotator(false, "es", "tokenizeNLs,"); annotation = new Annotation(spanishText); annotator.annotate(annotation); tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); assertEquals(spanishTokens2.size(), tokens.size()); for (int i = 0; i < tokens.size(); ++i) { assertEquals(spanishTokens2.get(i), tokens.get(i).value()); } } }