/*
* Copyright 2015
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.tokit;
import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence;
import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader;
public class RegexSegmenterTest
{
@Test
public void simpleExample()
throws Exception
{
// NOTE: This file contains Asciidoc markers for partial inclusion of this file in the
// documentation. Do not remove these tags!
// tag::example[]
JCas jcas = JCasFactory.createText("This is sentence 1 .\nThis is number 2 .", "en");
runPipeline(jcas,
createEngineDescription(RegexSegmenter.class,
// Treat each line as a sentence
RegexSegmenter.PARAM_SENTENCE_BOUNDARY_REGEX, "\n",
// Use whitespace to detect tokens
RegexSegmenter.PARAM_TOKEN_BOUNDARY_REGEX, "\\s+"));
for (Sentence s : select(jcas, Sentence.class)) {
for (Token t : selectCovered(Token.class, s)) {
System.out.printf("[%s] ", t.getCoveredText());
}
System.out.println();
}
// end::example[]
assertToken(
new String[] { "This", "is", "sentence", "1", ".", "This", "is", "number", "2", "." },
select(jcas, Token.class));
assertSentence(
new String[] {
"This is sentence 1 .",
"This is number 2 ." },
select(jcas, Sentence.class));
}
@Test
public void testWhitespace()
throws ResourceInitializationException
{
String text = "This is a tokenized text .";
String[] expectedSentences = new String[] { "This is a tokenized text ." };
String[] expectedTokens = new String[] { "This", "is", "a", "tokenized", "text", "." };
CollectionReaderDescription reader = createReaderDescription(StringReader.class,
StringReader.PARAM_DOCUMENT_TEXT, text,
StringReader.PARAM_LANGUAGE, "en");
AnalysisEngineDescription segmenter = createEngineDescription(RegexSegmenter.class);
for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter)) {
assertSentence(expectedSentences, select(jcas, Sentence.class));
assertToken(expectedTokens, select(jcas, Token.class));
}
}
@Test
public void testWhitespaceTwoLines()
throws ResourceInitializationException
{
String text = "This is a tokenized text .\nAnother line with tokens .";
String[] expectedTokens = new String[] { "This", "is", "a", "tokenized", "text", ".",
"Another", "line", "with", "tokens", "." };
String[] expectedSentences = new String[] { "This is a tokenized text .",
"Another line with tokens ." };
CollectionReaderDescription reader = createReaderDescription(StringReader.class,
StringReader.PARAM_DOCUMENT_TEXT, text,
StringReader.PARAM_LANGUAGE, "en");
AnalysisEngineDescription segmenter = createEngineDescription(RegexSegmenter.class);
for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter)) {
assertSentence(expectedSentences, select(jcas, Sentence.class));
assertToken(expectedTokens, select(jcas, Token.class));
}
}
@Test
public void testWhitespaceNoPunctuation()
throws ResourceInitializationException
{
String text = "This is a tokenized text";
String[] expectedSentences = new String[] { "This is a tokenized text" };
String[] expectedTokens = new String[] { "This", "is", "a", "tokenized", "text" };
CollectionReaderDescription reader = createReaderDescription(StringReader.class,
StringReader.PARAM_DOCUMENT_TEXT, text,
StringReader.PARAM_LANGUAGE, "en");
AnalysisEngineDescription segmenter = createEngineDescription(RegexSegmenter.class);
for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter)) {
assertSentence(expectedSentences, select(jcas, Sentence.class));
assertToken(expectedTokens, select(jcas, Token.class));
}
}
@Test
public void testTrailingWhitespace()
throws ResourceInitializationException
{
String text = "This is a tokenized text ";
String[] expectedSentences = new String[] { "This is a tokenized text " };
String[] expectedTokens = new String[] { "This", "is", "a", "tokenized", "text" };
CollectionReaderDescription reader = createReaderDescription(StringReader.class,
StringReader.PARAM_DOCUMENT_TEXT, text,
StringReader.PARAM_LANGUAGE, "en");
AnalysisEngineDescription segmenter = createEngineDescription(RegexSegmenter.class);
for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter)) {
assertSentence(expectedSentences, select(jcas, Sentence.class));
assertToken(expectedTokens, select(jcas, Token.class));
}
}
@Test
public void testWhitespacePunctuation()
throws ResourceInitializationException
{
String text = "This , is a tokenized text , with a final period .";
String[] expectedSentences = new String[] {
"This , is a tokenized text , with a final period ." };
String[] expectedTokens = new String[] { "This", ",", "is", "a", "tokenized", "text", ",",
"with", "a", "final", "period", "." };
CollectionReaderDescription reader = createReaderDescription(StringReader.class,
StringReader.PARAM_DOCUMENT_TEXT, text,
StringReader.PARAM_LANGUAGE, "en");
AnalysisEngineDescription segmenter = createEngineDescription(RegexSegmenter.class);
for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter)) {
assertSentence(expectedSentences, select(jcas, Sentence.class));
assertToken(expectedTokens, select(jcas, Token.class));
}
}
@Test
public void testRegex()
throws ResourceInitializationException
{
String text = "This-is-a-text-.";
String regex = "[-\n]";
String[] expectedSentences = new String[] { "This-is-a-text-." };
String[] expectedTokens = new String[] { "This", "is", "a", "text", "." };
CollectionReaderDescription reader = createReaderDescription(StringReader.class,
StringReader.PARAM_DOCUMENT_TEXT, text,
StringReader.PARAM_LANGUAGE, "en");
AnalysisEngineDescription segmenter = createEngineDescription(RegexSegmenter.class,
RegexSegmenter.PARAM_TOKEN_BOUNDARY_REGEX, regex);
for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter)) {
assertSentence(expectedSentences, select(jcas, Sentence.class));
assertToken(expectedTokens, select(jcas, Token.class));
}
}
}