/* LanguageTool, a natural language style checker
 * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.chunking;

import org.junit.Test;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;

/**
 * Tests for {@link EnglishChunker}: chunk tag assignment on hand-built and
 * analyzed token lists, handling of contractions, and tokenization.
 */
public class EnglishChunkerTest {

  /**
   * Checks the chunk tags assigned to a token list built by
   * {@link #createReadingsList(String)}. That helper keeps the whitespace
   * between words as tokens of their own, so the eight words of the sentence
   * sit at the even indices 0, 2, ..., 14 and the list has 15 entries.
   */
  @Test
  public void testAddChunkTags() throws Exception {
    EnglishChunker chunker = new EnglishChunker();
    List<AnalyzedTokenReadings> readingsList = createReadingsList("A short test of the bicycle is needed");
    chunker.addChunkTags(readingsList);
    assertThat(readingsList.size(), is(15));
    // "A short test":
    assertThat(readingsList.get(0).getChunkTags().toString(), is("[B-NP-singular]"));
    assertThat(readingsList.get(2).getChunkTags().toString(), is("[I-NP-singular]"));
    assertThat(readingsList.get(4).getChunkTags().toString(), is("[E-NP-singular]"));
    // "the bicycle":
    assertThat(readingsList.get(8).getChunkTags().toString(), is("[B-NP-singular]"));
    assertThat(readingsList.get(10).getChunkTags().toString(), is("[E-NP-singular]"));
    // "is needed":
    assertThat(readingsList.get(12).getChunkTags().toString(), is("[B-VP]"));
    assertThat(readingsList.get(14).getChunkTags().toString(), is("[I-VP]"));
  }

  /**
   * Checks singular/plural noun phrase tags on tokens produced by
   * {@link JLanguageTool#analyzeText(String)} rather than by the local helper.
   */
  @Test
  public void testAddChunkTagsSingular() throws Exception {
    EnglishChunker chunker = new EnglishChunker();
    JLanguageTool lt = new JLanguageTool(new English());
    List<AnalyzedSentence> sentences = lt.analyzeText("The abacus shows how numbers can be stored");
    // NOTE(review): the indices below imply one extra token ahead of "The"
    // (presumably a sentence-start marker) - confirm against AnalyzedSentence.
    List<AnalyzedTokenReadings> readingsList = Arrays.asList(sentences.get(0).getTokens());
    chunker.addChunkTags(readingsList);
    // "The abacus":
    assertThat(readingsList.get(1).getChunkTags().toString(), is("[B-NP-singular]"));
    assertThat(readingsList.get(3).getChunkTags().toString(), is("[E-NP-singular]"));
    // "numbers":
    assertThat(readingsList.get(9).getChunkTags().toString(), is("[B-NP-plural, E-NP-plural]"));
  }

  /**
   * Checks the chunk tags of a sentence containing a contraction, as returned
   * by {@link JLanguageTool#getAnalyzedSentence(String)}. The apostrophe and
   * "ll" tokens are expected to carry no chunk tags at all.
   */
  @Test
  public void testContractions() throws Exception {
    JLanguageTool langTool = new JLanguageTool(new English());
    AnalyzedSentence analyzedSentence = langTool.getAnalyzedSentence("I'll be there");
    AnalyzedTokenReadings[] tokens = analyzedSentence.getTokens();
    assertThat(tokens[1].getChunkTags().get(0), is(new ChunkTag("B-NP-singular")));
    assertThat(tokens[2].getChunkTags().size(), is(0));  // "'" cannot be mapped as we tokenize differently
    assertThat(tokens[3].getChunkTags().size(), is(0));  // "ll" cannot be mapped as we tokenize differently
    assertThat(tokens[5].getChunkTags().get(0), is(new ChunkTag("I-VP")));
  }

  /**
   * Checks that {@code tokenize} yields the same tokens for a contraction
   * written with a straight apostrophe and with a typographic one.
   */
  @Test
  public void testTokenize() throws Exception {
    EnglishChunker chunker = new EnglishChunker();
    String expected = "[I, 'm, going, to, London]";
    assertThat(Arrays.toString(chunker.tokenize("I'm going to London")), is(expected));
    assertThat(Arrays.toString(chunker.tokenize("I’m going to London")), is(expected));  // different apostrophe char
  }

  /**
   * Builds a token list from a sentence by splitting on single spaces, with
   * every space kept as a token of its own.
   *
   * @param sentence the text to split
   * @return one {@link AnalyzedTokenReadings} per word and per space, each
   *         created with the character offset at which its token starts
   */
  private List<AnalyzedTokenReadings> createReadingsList(String sentence) {
    // returnDelims = true: the separating spaces are returned as tokens, too
    StringTokenizer tokenizer = new StringTokenizer(sentence, " ", true);
    List<AnalyzedTokenReadings> result = new ArrayList<>();
    int pos = 0;
    while (tokenizer.hasMoreTokens()) {
      String token = tokenizer.nextToken();
      if (token.trim().isEmpty()) {
        // whitespace token: both remaining constructor arguments are null
        result.add(new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), pos));
      } else {
        // word token: placeholder value "fake" for both remaining arguments
        result.add(new AnalyzedTokenReadings(new AnalyzedToken(token, "fake", "fake"), pos));
      }
      pos += token.length();
    }
    return result;
  }

}