package edu.stanford.nlp.international.spanish;

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import junit.framework.TestCase;

/**
 * Tests for the Spanish tokenizer: clitic-pronoun splitting, contractions,
 * compounds, parenthesis escaping, and character offsets.
 *
 * Needs to be an "itest" because the VerbStripper loads data from the models jar.
 *
 * @author Ishita Prasad
 */
public class SpanishTokenizerITest extends TestCase {

  /** Raw input sentences; parallel to {@link #spanishGold}. */
  private final String[] spanishInputs = {
      "Esta es una oración.",
      "¡Dímelo!",
      "Hazlo.",
      "Este es un címbalo.",
      "Metelo.",
      "Sentémonos.",
      "Escribámosela.",
      "No comamos allí.",
      "Comamosla.",
      "sub-20",
      "un teléfono (902.400.345).",
      "Port-au-Prince",
      "McLaren/Mercedes",
      "10/12",
      "4X4",
      "3G",
      "3g",
      "sp3",
      "12km",
      "12km/h",
      "Los hombres sentados están muy guapos.",
      "Hizo abrirlos.",
      "salos ) ( 1 de",
  };

  /** Expected token sequences, one row per entry in {@link #spanishInputs}. */
  private final String[][] spanishGold = {
      { "Esta", "es", "una", "oración", "." },
      { "¡", "Di", "me", "lo", "!" },
      { "Haz", "lo", "." },
      { "Este", "es", "un", "címbalo", "." },
      { "Mete", "lo", "." },
      { "Sentemos", "nos", "." },
      { "Escribamos", "se", "la", "." },
      { "No", "comamos", "allí", "." },
      { "Comamos", "la", "." },
      { "sub-20" },
      { "un", "teléfono", "=LRB=", "902.400.345", "=RRB=", "." },
      { "Port", "-", "au", "-", "Prince" },
      { "McLaren", "/", "Mercedes" },
      { "10/12" },
      { "4X4" },
      { "3G" },
      { "3g" },
      { "sp3" },
      { "12", "km" },
      { "12", "km", "/", "h" },
      { "Los", "hombres", "sentados", "están", "muy", "guapos", "." },
      { "Hizo", "abrir", "los", "." },
      { "salos", "=RRB=", "=LRB=", "1", "de" },
  };

  /**
   * Tokenizes each input sentence with the given factory and checks the token
   * stream against the corresponding gold token sequence.
   *
   * @param tf     tokenizer factory to exercise (options already set by caller)
   * @param inputs raw sentences
   * @param gold   expected tokens, parallel to {@code inputs}
   */
  private static void runSpanish(TokenizerFactory<CoreLabel> tf, String[] inputs, String[][] gold) {
    for (int sent = 0; sent < inputs.length; sent++) {
      Tokenizer<CoreLabel> spanishTokenizer = tf.getTokenizer(new StringReader(inputs[sent]));
      int i = 0;
      while (spanishTokenizer.hasNext()) {
        String w = spanishTokenizer.next().word();
        // Bounds check instead of catching ArrayIndexOutOfBoundsException:
        // if the tokenizer emits extra tokens, the count assertion below
        // reports the mismatch with a meaningful message.
        if (i < gold[sent].length) {
          assertEquals("SpanishTokenizer problem", gold[sent][i], w);
        }
        i++;
      }
      // expected value first, actual second (JUnit convention)
      assertEquals("SpanishTokenizer num tokens problem", gold[sent].length, i);
    }
  }

  /** Runs the gold-token comparison with the AnCora-style tokenizer factory. */
  public void testSpanishTokenizerWord() {
    // Use assertEquals, not the assert keyword: Java assertions are disabled
    // by default (-ea not set), so a bare assert would silently do nothing.
    assertEquals("inputs and gold must be parallel arrays",
                 spanishInputs.length, spanishGold.length);
    final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.ancoraFactory();
    tf.setOptions("");
    tf.setOptions("tokenizeNLs");
    runSpanish(tf, spanishInputs, spanishGold);
  }

  /** Makes a Spanish tokenizer with the options that CoreNLP uses. Results actually no different.... */
  public void testSpanishTokenizerCoreNLP() {
    assertEquals("inputs and gold must be parallel arrays",
                 spanishInputs.length, spanishGold.length);
    final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
    tf.setOptions("");
    tf.setOptions("invertible,ptb3Escaping=true,splitAll=true");
    runSpanish(tf, spanishInputs, spanishGold);
  }

  /**
   * Checks character offsets and original/normalized text annotations on a
   * sentence containing tabs, newlines, and clitic-pronoun verbs.
   */
  public void testOffsetsSpacing() {
    // guide             1         2         3         4         5          6         7          8          9         0         1         2         3
    // guide   0123456789012345678901234567890123456789012345678 90123456789012345678901234567 8 901234567890123456789012345678901234567890123456789012345
    String text = "  La combinación consonántica ss es ajena a la\tortografía castellana: \n\n traigámosela, mandémoselos, escribámosela, comprémoselo.";
    final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
    tf.setOptions("");
    tf.setOptions("splitAll=true");
    Tokenizer<CoreLabel> spanishTokenizer = tf.getTokenizer(new StringReader(text));
    List<CoreLabel> tokens = spanishTokenizer.tokenize();
    System.err.println(tokens);
    assertEquals(27, tokens.size());
    // assertEquals("  ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
    // assertEquals("\t", tokens.get(8).get(CoreAnnotations.AfterAnnotation.class));
    assertEquals("Begin char offset", 2, (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    assertEquals("End char offset", 4, (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    assertEquals("La", tokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class));
    // note: after(x) and before(x+1) are the same
    // assertEquals(" ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
    // assertEquals(" ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));
    // The clitic-split tokens of "escribámosela" share/partition the original span.
    assertEquals("escribámo", tokens.get(19).get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals("escribamos", tokens.get(19).get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 108, (int) tokens.get(19).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    assertEquals("End char offset", 117, (int) tokens.get(19).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    assertEquals("se", tokens.get(20).get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals("se", tokens.get(20).get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 117, (int) tokens.get(20).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    assertEquals("End char offset", 119, (int) tokens.get(20).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    assertEquals("la", tokens.get(21).get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals("la", tokens.get(21).get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 119, (int) tokens.get(21).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    assertEquals("End char offset", 121, (int) tokens.get(21).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    assertEquals(",", tokens.get(22).get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals(",", tokens.get(22).get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 121, (int) tokens.get(22).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    assertEquals("End char offset", 122, (int) tokens.get(22).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  }

  /**
   * Tokenizes {@code input} with the AnCora factory and verifies each token's
   * begin/end character positions against the given parallel offset arrays.
   *
   * @param input        raw text to tokenize
   * @param beginOffsets expected begin position for each token
   * @param endOffsets   expected end position for each token
   */
  private void testOffset(String input, int[] beginOffsets, int[] endOffsets) {
    TokenizerFactory<CoreLabel> tf = SpanishTokenizer.ancoraFactory();
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(input));
    List<CoreLabel> tokens = tokenizer.tokenize();
    assertEquals("Number of tokens doesn't match reference '" + input + "'",
                 beginOffsets.length, tokens.size());
    for (int i = 0; i < beginOffsets.length; i++) {
      assertEquals("Char begin offset of word " + i + " deviates from reference '" + input + "'",
                   beginOffsets[i], tokens.get(i).beginPosition());
      assertEquals("Char end offset of word " + i + " deviates from reference '" + input + "'",
                   endOffsets[i], tokens.get(i).endPosition());
    }
  }

  public void testCliticPronounOffset() {
    // will be tokenized into "tengo que decir te algo"
    testOffset("tengo que decirte algo",
               new int[]{0, 6, 10, 15, 18},
               new int[]{5, 9, 15, 17, 22});
  }

  public void testIr() {
    // "ir" is a special case -- it is a verb ending without a stem!
    testOffset("tengo que irme ahora",
               new int[] {0, 6, 10, 12, 15},
               new int[] {5, 9, 12, 14, 20});
  }

  public void testContractionOffsets() {
    // y de el y
    testOffset("y del y", new int[] {0, 2, 3, 6}, new int[] {1, 3, 5, 7});
    // y a el y
    testOffset("y al y", new int[] {0, 2, 3, 5}, new int[] {1, 3, 4, 6});
    // y con mí y
    testOffset("y conmigo y", new int[] {0, 2, 5, 10}, new int[] {1, 5, 9, 11});
  }

  public void testCompoundOffset() {
    testOffset("y abc-def y", new int[] {0, 2, 5, 6, 10}, new int[] {1, 5, 6, 9, 11});
    testOffset("y abc - def y", new int[] {0, 2, 6, 8, 12}, new int[] {1, 5, 7, 11, 13});
  }

}