/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.List;
import static org.junit.Assert.assertEquals;
public class StringSequenceGeneratorTest
{
/**
 * Build a fresh JCas over the text {@code "Token1 Token2"} with document id
 * {@code "tokensTest"} and one {@link Token} annotation per word
 * (offsets 0-6 and 7-13).
 *
 * @return a {@link JCas} with two tokens.
 * @throws UIMAException if one of the underlying UIMA calls fails.
 */
protected static JCas jCasWithTokens()
        throws UIMAException
{
    JCas cas = JCasFactory.createJCas();
    cas.setDocumentText("Token1 Token2");

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("tokensTest");
    meta.addToIndexes(cas);

    // Tokens are created and indexed in document order.
    Token first = new Token(cas, 0, 6);
    Token second = new Token(cas, 7, 13);
    first.addToIndexes(cas);
    second.addToIndexes(cas);

    return cas;
}
/**
 * Build a fresh JCas over the text {@code "token1 token2"} with document id
 * {@code "lemmasTest"}. Each word gets a {@link Token} and a {@link Lemma} on the same
 * span; the lemma values are {@code "lemma1"} and {@code "lemma2"} and each lemma is
 * attached to its token.
 *
 * @return a {@link JCas} with two tokens and two lemmas.
 * @throws UIMAException if one of the underlying UIMA calls fails.
 */
protected static JCas jCasWithLemmas()
        throws UIMAException
{
    JCas cas = JCasFactory.createJCas();
    cas.setDocumentText("token1 token2");

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("lemmasTest");
    meta.addToIndexes(cas);

    // Creation order (tokens first, then lemmas) is kept as in the original fixture.
    Token firstToken = new Token(cas, 0, 6);
    Token secondToken = new Token(cas, 7, 13);

    Lemma firstLemma = new Lemma(cas, 0, 6);
    firstLemma.setValue("lemma1");
    Lemma secondLemma = new Lemma(cas, 7, 13);
    secondLemma.setValue("lemma2");

    // Link every token to the lemma covering the same span.
    firstToken.setLemma(firstLemma);
    secondToken.setLemma(secondLemma);

    // Tokens are indexed before lemmas.
    firstToken.addToIndexes(cas);
    secondToken.addToIndexes(cas);
    firstLemma.addToIndexes(cas);
    secondLemma.addToIndexes(cas);

    return cas;
}
/**
 * Build a fresh JCas over the text {@code "token1 token2"} with two {@link Token}
 * annotations and one {@link NamedEntity} with value {@code "TEST"} spanning the first
 * token (offsets 0-6).
 *
 * @return a {@link JCas} with two tokens and one named entity.
 * @throws UIMAException if one of the underlying UIMA calls fails.
 */
protected static JCas jcasWithNamedEntity()
        throws UIMAException
{
    JCas cas = JCasFactory.createJCas();
    cas.setDocumentText("token1 token2");

    DocumentMetaData meta = DocumentMetaData.create(cas);
    // NOTE(review): the id "lemmasTest" is the same as in the lemma fixture and looks
    // copy-pasted; kept unchanged here - confirm whether a distinct id is intended.
    meta.setDocumentId("lemmasTest");
    meta.addToIndexes(cas);

    Token firstToken = new Token(cas, 0, 6);
    Token secondToken = new Token(cas, 7, 13);

    NamedEntity entity = new NamedEntity(cas, 0, 6);
    entity.setValue("TEST");

    // The named entity is indexed before the tokens, as in the original fixture.
    entity.addToIndexes(cas);
    firstToken.addToIndexes(cas);
    secondToken.addToIndexes(cas);

    return cas;
}
/**
 * Build the two-token JCas and add a single {@link Sentence} annotation spanning
 * offsets 0-13, i.e. the whole document text.
 *
 * @return a {@link JCas} with two tokens and one sentence annotation.
 * @throws UIMAException if building the token fixture fails.
 * @see #jCasWithTokens()
 */
protected static JCas jCasWithSentence()
        throws UIMAException
{
    JCas cas = jCasWithTokens();
    new Sentence(cas, 0, 13).addToIndexes();
    return cas;
}
/**
 * With the {@link Token} type name as feature path, the first sequence is expected to
 * hold exactly {@code "Token1"} and {@code "Token2"}, in that order.
 */
@Test
public void testGenerateSequenceFeaturePath()
        throws FeaturePathException, UIMAException, IOException
{
    JCas cas = jCasWithTokens();
    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .featurePath(Token.class.getName())
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] tokens = allSequences.get(0);

    assertEquals(2, tokens.length);
    assertEquals("Token1", tokens[0]);
    assertEquals("Token2", tokens[tokens.length - 1]);
}
/**
 * Same setup as the plain feature path test, but with {@code lowercase(true)}: the
 * first sequence is expected to hold {@code "token1"} and {@code "token2"}.
 */
@Test
public void testGenerateSequenceFeaturePathLowercase()
        throws FeaturePathException, UIMAException, IOException
{
    JCas cas = jCasWithTokens();
    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .featurePath(Token.class.getName())
            .lowercase(true)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] tokens = allSequences.get(0);

    assertEquals(2, tokens.length);
    assertEquals("token1", tokens[0]);
    assertEquals("token2", tokens[tokens.length - 1]);
}
/**
 * With the feature path {@code <Token type>/lemma/value}, the first sequence is
 * expected to hold the lemma values {@code "lemma1"} and {@code "lemma2"} rather than
 * the token texts.
 */
@Test
public void testGenerateSequenceFeaturePathLemmas()
        throws UIMAException, FeaturePathException, IOException
{
    String lemmaValuePath = Token.class.getName() + "/lemma/value";
    JCas cas = jCasWithLemmas();

    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .featurePath(lemmaValuePath)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] lemmas = allSequences.get(0);

    assertEquals(2, lemmas.length);
    assertEquals("lemma1", lemmas[0]);
    assertEquals("lemma2", lemmas[lemmas.length - 1]);
}
/**
 * With the {@link NamedEntity} type name as feature path, the first sequence is
 * expected to contain one element, {@code "token1"} - the document text under the
 * fixture's only named entity, not its {@code "TEST"} value.
 */
@Test
public void testFeaturePathNamedEntities()
        throws UIMAException, IOException, FeaturePathException
{
    JCas cas = jcasWithNamedEntity();
    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .featurePath(NamedEntity.class.getCanonicalName())
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] entities = allSequences.get(0);

    assertEquals(1, entities.length);
    assertEquals("token1", entities[0]);
}
/**
 * With {@link Sentence} as covering type and {@link Token} as feature path, the
 * one-sentence fixture is expected to produce exactly one sequence holding
 * {@code "Token1"} and {@code "Token2"}.
 */
@Test
public void testGenerateSequenceFeaturePathCovering()
        throws FeaturePathException, UIMAException, IOException
{
    String tokenPath = Token.class.getName();
    String sentenceType = Sentence.class.getTypeName();
    JCas cas = jCasWithSentence();

    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .featurePath(tokenPath)
            .lowercase(false)
            .coveringType(sentenceType)
            .buildStringSequenceGenerator();

    List<String[]> perSentence = generator.tokenSequences(cas);
    assertEquals(1, perSentence.size());

    String[] tokens = perSentence.get(0);
    assertEquals(2, tokens.length);
    assertEquals("Token1", tokens[0]);
    assertEquals("Token2", tokens[tokens.length - 1]);
}
/**
 * With the filter regex {@code ".*1"} and no explicit feature path, a single sequence
 * is expected in which only {@code "Token2"} remains.
 */
@Test
public void testFilterRegex()
        throws UIMAException, IOException, FeaturePathException
{
    JCas cas = jCasWithTokens();
    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .filterRegex(".*1")
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    assertEquals(1, allSequences.size());

    String[] remaining = allSequences.get(0);
    assertEquals(1, remaining.length);
    assertEquals("Token2", remaining[0]);
}
/**
 * With the filter regex {@code ".*1"} plus a replacement string, the sequence is
 * expected to keep both positions: the replacement first, then the untouched
 * {@code "Token2"}.
 */
@Test
public void testFilterRegexReplace()
        throws UIMAException, IOException, FeaturePathException
{
    String substitute = "REPLACED";
    JCas cas = jCasWithTokens();

    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .filterRegex(".*1")
            .filterRegexReplacement(substitute)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    assertEquals(1, allSequences.size());

    String[] tokens = allSequences.get(0);
    assertEquals(2, tokens.length);
    assertEquals(substitute, tokens[0]);
    assertEquals("Token2", tokens[1]);
}
/**
 * Registers two filter regexes ({@code ".*1"} and {@code "xyz"}) on the same builder
 * and expects a single sequence in which only {@code "Token2"} remains.
 */
@Test
public void testFilterRegexMultiple()
        throws UIMAException, IOException, FeaturePathException
{
    JCas cas = jCasWithTokens();
    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .filterRegex(".*1")
            .filterRegex("xyz")
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    assertEquals(1, allSequences.size());

    String[] remaining = allSequences.get(0);
    assertEquals(1, remaining.length);
    assertEquals("Token2", remaining[0]);
}
/**
 * Supplies the stopword list as a classpath {@link URL} ({@code /stopwords.txt}) and
 * expects the first sequence to contain only {@code "Token2"}.
 */
@Test
public void testFilterStopwordsURL()
        throws UIMAException, FeaturePathException, IOException
{
    // Presumably stopwords.txt lists the first token of the fixture - verify against
    // the test resource.
    JCas cas = jCasWithTokens();
    URL stopwords = getClass().getResource("/stopwords.txt");

    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .stopwordsURL(stopwords)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] remaining = allSequences.get(0);

    assertEquals(1, remaining.length);
    assertEquals("Token2", remaining[0]);
}
/**
 * Supplies the stopword list as a path string
 * ({@code src/test/resources/stopwords.txt}) and expects the first sequence to contain
 * only {@code "Token2"}.
 */
@Test
public void testFilterStopwordsFileString()
        throws UIMAException, FeaturePathException, IOException
{
    JCas cas = jCasWithTokens();
    // The relative path is not resolved in this test; how it is resolved is up to the
    // builder.
    String stopwordsPath = "src/test/resources/stopwords.txt";

    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .stopwordsFile(stopwordsPath)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] remaining = allSequences.get(0);

    assertEquals(1, remaining.length);
    assertEquals("Token2", remaining[0]);
}
/**
 * Supplies the stopword list as a {@link File}
 * ({@code src/test/resources/stopwords.txt}) and expects the first sequence to contain
 * only {@code "Token2"}.
 */
@Test
public void testFilterStopwordsFile()
        throws UIMAException, FeaturePathException, IOException
{
    JCas cas = jCasWithTokens();
    File stopwords = new File("src/test/resources/stopwords.txt");

    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .stopwordsFile(stopwords)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] remaining = allSequences.get(0);

    assertEquals(1, remaining.length);
    assertEquals("Token2", remaining[0]);
}
/* Tests for character-level sequences. */
/**
 * With {@code characters(true)}, the first sequence is expected to have 13 elements -
 * the length of the fixture text {@code "Token1 Token2"} - starting with {@code "T"}
 * and ending with {@code "2"}.
 */
@Test
public void testCharacterSequence()
        throws UIMAException, FeaturePathException, IOException
{
    JCas cas = jCasWithTokens();
    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .characters(true)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] chars = allSequences.get(0);

    assertEquals(13, chars.length);
    assertEquals("T", chars[0]);
    // The length was asserted above, so the last index is 12.
    assertEquals("2", chars[chars.length - 1]);
}
/**
 * With {@code lowercase(true)} and {@code characters(true)}, the first sequence is
 * expected to have 13 elements, starting with a lowercased {@code "t"} and ending with
 * {@code "2"}.
 */
@Test
public void testCharacterSequenceLowercase()
        throws UIMAException, FeaturePathException, IOException
{
    JCas cas = jCasWithTokens();
    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .lowercase(true)
            .characters(true)
            .buildStringSequenceGenerator();

    List<String[]> allSequences = generator.tokenSequences(cas);
    String[] chars = allSequences.get(0);

    assertEquals(13, chars.length);
    assertEquals("t", chars[0]);
    // The length was asserted above, so the last index is 12.
    assertEquals("2", chars[chars.length - 1]);
}
/**
 * With {@link Sentence} as covering type and {@code characters(true)}, the
 * one-sentence fixture is expected to produce exactly one sequence of 13 elements,
 * starting with {@code "T"} and ending with {@code "2"}.
 */
@Test
public void testCharacterSequenceWithCovering()
        throws UIMAException, FeaturePathException, IOException
{
    String sentenceType = Sentence.class.getTypeName();
    JCas cas = jCasWithSentence();

    StringSequenceGenerator generator = new PhraseSequenceGenerator.Builder()
            .coveringType(sentenceType)
            .characters(true)
            .buildStringSequenceGenerator();

    List<String[]> perSentence = generator.tokenSequences(cas);
    assertEquals(1, perSentence.size());

    String[] chars = perSentence.get(0);
    assertEquals(13, chars.length);
    assertEquals("T", chars[0]);
    // The length was asserted above, so the last index is 12.
    assertEquals("2", chars[chars.length - 1]);
}
}