package edu.stanford.nlp.simple;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import static org.junit.Assert.*;

/**
 * A test for aspects of {@link edu.stanford.nlp.simple.Sentence} which do not require loading the NLP models.
 *
 * @author Gabor Angeli
 */
public class SentenceTest {

  /** Single-sentence sample text without trailing punctuation. */
  private static final String FOX = "the quick brown fox jumped over the lazy dog";

  /** Two-sentence sample text used by the sentence index / token offset tests. */
  private static final String FOX_TWO_SENTENCES =
      "the quick brown fox jumped over the lazy dog. The lazy dog was not impressed.";

  /** Sample text containing parentheses and a clitic. */
  private static final String USA = "United States of America (USA) it's a country.";

  /** Sample text fed through the full pipeline in the CoreMap tests. */
  private static final String SIMPLE = "This is a sentence.";

  /**
   * Runs a pipeline configured with {@code annotators = tokenize,ssplit} over the given text.
   *
   * <p>A plain {@link Properties} object is populated explicitly rather than via
   * double-brace initialization, which would create an anonymous subclass holding a
   * reference to the enclosing test instance.
   *
   * @param text the raw text to annotate
   * @return the annotation after the pipeline has been applied to it
   */
  private static Annotation tokenizeAndSplit(String text) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation ann = new Annotation(text);
    pipeline.annotate(ann);
    return ann;
  }

  /**
   * Builds a mutable {@link ArrayList} holding the given tokens in order.
   *
   * @param tokens the tokens to place in the list
   * @return a new list containing exactly {@code tokens}
   */
  private static List<String> tokenList(String... tokens) {
    List<String> list = new ArrayList<String>(tokens.length);
    for (String token : tokens) {
      list.add(token);
    }
    return list;
  }

  /** Constructing a sentence from raw text yields an object. */
  @Test
  public void testCreateFromText() {
    Sentence sent = new Sentence(FOX);
    assertNotNull(sent);
  }

  /** {@code text()} returns the string the sentence was built from. */
  @Test
  public void testText() {
    Sentence sent = new Sentence(FOX);
    assertEquals(FOX, sent.text());
  }

  /** {@code length()} reports nine for the nine-word sample. */
  @Test
  public void testLength() {
    Sentence sent = new Sentence(FOX);
    assertEquals(9, sent.length());
  }

  /** The sentence's document exposes the same sentence at index 0. */
  @Test
  public void testDocumentLinking() {
    Sentence sent = new Sentence(FOX);
    assertEquals(sent, sent.document.sentence(0));
  }

  /** Words are indexed from zero and the final period is its own token. */
  @Test
  public void testBasicTokenization() {
    Sentence sent = new Sentence("the quick brown fox jumped over the lazy dog.");
    assertEquals("the", sent.word(0));
    assertEquals("quick", sent.word(1));
    assertEquals("dog", sent.word(8));
    assertEquals(".", sent.word(9));
  }

  /** {@code word()} is expected to give -LRB-/-RRB- for parentheses and split off {@code 's}. */
  @Test
  public void testWeirdTokens() {
    Sentence sent = new Sentence(USA);
    assertEquals("-LRB-", sent.word(4));
    assertEquals("-RRB-", sent.word(6));
    assertEquals("'s", sent.word(8));
  }

  /** {@code originalText()} is expected to give the literal characters from the input. */
  @Test
  public void testOriginalText() {
    Sentence sent = new Sentence(USA);
    assertEquals("(", sent.originalText(4));
    assertEquals(")", sent.originalText(6));
    assertEquals("it", sent.originalText(7));
    assertEquals("'s", sent.originalText(8));
  }

  /** Character offsets: begin/end of the first token, and of the opening parenthesis. */
  @Test
  public void testCharacterOffsets() {
    Sentence sent = new Sentence(USA);
    assertEquals(0, sent.characterOffsetBegin(0));
    assertEquals(6, sent.characterOffsetEnd(0));
    assertEquals(7, sent.characterOffsetBegin(1));
    assertEquals(25, sent.characterOffsetBegin(4));
    assertEquals(26, sent.characterOffsetEnd(4));
  }

  /** A standalone sentence has index 0; sentences of a document are numbered in order. */
  @Test
  public void testSentenceIndex() {
    Sentence sent = new Sentence(FOX);
    assertEquals(0, sent.sentenceIndex());

    Document doc = new Document(FOX_TWO_SENTENCES);
    List<Sentence> sentences = doc.sentences();
    assertEquals(0, sentences.get(0).sentenceIndex());
    assertEquals(1, sentences.get(1).sentenceIndex());
  }

  /** Token offsets of consecutive sentences: the second begins where the first ends. */
  @Test
  public void testSentenceTokenOffsets() {
    Sentence sent = new Sentence(FOX);
    assertEquals(0, sent.sentenceTokenOffsetBegin());

    Document doc = new Document(FOX_TWO_SENTENCES);
    List<Sentence> sentences = doc.sentences();
    assertEquals(0, sentences.get(0).sentenceTokenOffsetBegin());
    assertEquals(10, sentences.get(0).sentenceTokenOffsetEnd());
    assertEquals(10, sentences.get(1).sentenceTokenOffsetBegin());
    assertEquals(17, sentences.get(1).sentenceTokenOffsetEnd());
  }

  /** Building a sentence from a pipeline-produced CoreMap must not throw. */
  @Test
  public void testFromCoreMapCrashCheck() {
    Annotation ann = tokenizeAndSplit(SIMPLE);
    CoreMap map = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    // No assertion on purpose: the test passes as long as the constructor returns normally.
    new Sentence(map);
  }

  /** A sentence built from a CoreMap matches the annotation's text, first word and length. */
  @Test
  public void testFromCoreMapCorrectnessCheck() {
    Annotation ann = tokenizeAndSplit(SIMPLE);
    CoreMap map = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    Sentence s = new Sentence(map);
    assertEquals(ann.get(CoreAnnotations.TextAnnotation.class), s.text());
    assertEquals("This", s.word(0));
    assertEquals(5, s.length());
  }

  /** A sentence built from a token list exposes those tokens as its words. */
  @Test
  public void testTokenizeWhitespaceSimple() {
    Sentence s = new Sentence(tokenList("foo", "bar"));
    assertEquals("foo", s.word(0));
    assertEquals("bar", s.word(1));
  }

  /** A supplied token that contains a space is kept as a single word. */
  @Test
  public void testTokenizeWhitespaceWithSpaces() {
    Sentence s = new Sentence(tokenList("foo", "with whitespace", "baz"));
    assertEquals("foo", s.word(0));
    assertEquals("with whitespace", s.word(1));
    assertEquals("baz", s.word(2));
  }
}