/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.textnormalizer.casfilter;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.junit.Before;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader;
import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter;
/**
 * Tests for {@link CasFilter_ImplBase}, exercised through the three small filter implementations
 * declared at the bottom of this class.
 * <p>
 * Every test runs a one-document pipeline that ends in the aggregate returned by
 * {@code CasFilter_ImplBase.createAggregateBuilderDescription(filter, writer)}, where the writer
 * is a {@link CasDumpWriter} targeting {@code target/filteroutput.txt}. The "Pass" tests expect
 * the dump file to contain output afterwards, the "Remove" tests expect it to be empty.
 */
public class CasFilter_ImplBaseTest
{
    /**
     * Encoding used to read the dump file back.
     * NOTE(review): assumes CasDumpWriter writes UTF-8 - confirm. The content checked by these
     * tests is plain ASCII, so the assertions do not depend on it either way.
     */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /** Dump file shared by all tests; re-created for every test in {@link #setUp()}. */
    private File tmpFile;

    /**
     * Points {@link #tmpFile} at the dump location, makes sure its directory exists and removes
     * any dump left over from an earlier test so that stale content cannot satisfy (or break) an
     * assertion of the current one.
     */
    @Before
    public void setUp()
    {
        tmpFile = new File("target/filteroutput.txt");
        tmpFile.getParentFile().mkdirs();
        // NOTE(review): relies on the writer creating the file even when no CAS reaches it;
        // the "Remove" tests on a clean build directory already depend on that.
        FileUtils.deleteQuietly(tmpFile);
    }

    /**
     * A non-empty document receives a sentence from {@link TestAnnotator}, so
     * {@link AnnotationBasedFilter} accepts it and the dump must contain the document.
     */
    @Test
    public void testAnnotationFilterPass()
        throws UIMAException, IOException
    {
        String input = "test";
        String expectedFirstLine = "======== CAS 0 begin ==================================";

        SimplePipeline.runPipeline(createReader(input, "en"),
                createEngineDescription(TestAnnotator.class),
                createFilteredWriter(createEngineDescription(AnnotationBasedFilter.class)));

        List<String> output = FileUtils.readLines(tmpFile, OUTPUT_ENCODING);
        assertEquals(expectedFirstLine, output.get(0));
        // The fixed line positions below are tied to the layout of the dump file.
        assertEquals(input, output.get(13));
        assertEquals("Sentence", output.get(15));
    }

    /**
     * An empty document gets no sentence from {@link TestAnnotator}, so
     * {@link AnnotationBasedFilter} rejects it and the dump must stay empty.
     */
    @Test
    public void testAnnotationFilterRemove()
        throws UIMAException, IOException
    {
        SimplePipeline.runPipeline(createReader("", "en"),
                createEngineDescription(TestAnnotator.class),
                createFilteredWriter(createEngineDescription(AnnotationBasedFilter.class)));

        assertTrue(readOutput().isEmpty());
    }

    /**
     * An empty document is rejected by {@link EmptyDocumentFilter}; the dump must stay empty.
     */
    @Test
    public void testEmptyDocumentFilterRemove()
        throws UIMAException, IOException
    {
        SimplePipeline.runPipeline(createReader("", "en"),
                createFilteredWriter(createEngineDescription(EmptyDocumentFilter.class)));

        assertTrue(readOutput().isEmpty());
    }

    /**
     * A non-empty document is accepted by {@link EmptyDocumentFilter}; the dump must have content.
     */
    @Test
    public void testEmptyDocumentFilterPass()
        throws UIMAException, IOException
    {
        SimplePipeline.runPipeline(createReader("test", "en"),
                createFilteredWriter(createEngineDescription(EmptyDocumentFilter.class)));

        assertFalse(readOutput().isEmpty());
    }

    /**
     * A document whose language ("en") is among the required languages is accepted by
     * {@link LanguageFilter}; the dump must have content.
     */
    @Test
    public void testLanguageFilterPass()
        throws UIMAException, IOException
    {
        AnalysisEngineDescription filter = createEngineDescription(LanguageFilter.class,
                LanguageFilter.PARAM_REQUIRED_LANGUAGES, new String[] { "de", "en" });

        SimplePipeline.runPipeline(createReader("test", "en"), createFilteredWriter(filter));

        assertFalse(readOutput().isEmpty());
    }

    /**
     * A document whose language ("ch") is not among the required languages is rejected by
     * {@link LanguageFilter}; the dump must stay empty.
     */
    @Test
    public void testLanguageFilterRemove()
        throws UIMAException, IOException
    {
        AnalysisEngineDescription filter = createEngineDescription(LanguageFilter.class,
                LanguageFilter.PARAM_REQUIRED_LANGUAGES, new String[] { "de", "en" });

        SimplePipeline.runPipeline(createReader("test", "ch"), createFilteredWriter(filter));

        assertTrue(readOutput().isEmpty());
    }

    /**
     * Creates a reader description that supplies the given text with the given language code.
     *
     * @param aText
     *            the document text.
     * @param aLanguage
     *            the document language code.
     * @return the reader description.
     * @throws UIMAException
     *             if the description cannot be created.
     */
    private static CollectionReaderDescription createReader(String aText, String aLanguage)
        throws UIMAException
    {
        return createReaderDescription(StringReader.class,
                StringReader.PARAM_DOCUMENT_TEXT, aText,
                StringReader.PARAM_LANGUAGE, aLanguage);
    }

    /**
     * Combines the given filter with a {@link CasDumpWriter} that writes to {@link #tmpFile}.
     *
     * @param aFilter
     *            description of the filter under test.
     * @return the aggregate built by {@code CasFilter_ImplBase.createAggregateBuilderDescription}.
     * @throws UIMAException
     *             if a description cannot be created.
     * @throws IOException
     *             declared defensively; the aggregate factory's signature is not visible here.
     */
    private AnalysisEngineDescription createFilteredWriter(AnalysisEngineDescription aFilter)
        throws UIMAException, IOException
    {
        AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class,
                CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile);
        return CasFilter_ImplBase.createAggregateBuilderDescription(aFilter, writer);
    }

    /**
     * Reads the whole dump file using {@link #OUTPUT_ENCODING}.
     *
     * @return the content of {@link #tmpFile}.
     * @throws IOException
     *             if the file cannot be read.
     */
    private String readOutput()
        throws IOException
    {
        return FileUtils.readFileToString(tmpFile, OUTPUT_ENCODING);
    }

    /**
     * Annotator that marks the complete document text as a single {@link Sentence}.
     */
    public static class TestAnnotator
        extends JCasAnnotator_ImplBase
    {
        /**
         * Creates one sentence spanning the full text; does nothing when the text is missing or
         * empty.
         *
         * @param aJCas
         *            the CAS whose document text is annotated.
         * @throws AnalysisEngineProcessException
         *             declared by the overridden method; not thrown by this implementation.
         */
        @Override
        public void process(JCas aJCas)
            throws AnalysisEngineProcessException
        {
            String text = aJCas.getDocumentText();
            if (text == null || text.isEmpty()) {
                return;
            }

            Sentence sentence = new Sentence(aJCas);
            sentence.setBegin(0);
            sentence.setEnd(text.length());
            sentence.addToIndexes(aJCas);
        }
    }

    /**
     * Filter that accepts only documents containing at least one {@link Sentence} annotation.
     */
    public static class AnnotationBasedFilter
        extends CasFilter_ImplBase
    {
        /**
         * @param aJCas
         *            the CAS to inspect.
         * @return {@code true} if the CAS contains at least one sentence annotation.
         */
        @Override
        protected boolean pass(JCas aJCas)
        {
            return !select(aJCas, Sentence.class).isEmpty();
        }
    }

    /**
     * Filter that accepts only documents with a non-empty document text.
     */
    public static class EmptyDocumentFilter
        extends CasFilter_ImplBase
    {
        /**
         * @param aJCas
         *            the CAS to inspect.
         * @return {@code true} if the document text is present and has at least one character.
         */
        @Override
        protected boolean pass(JCas aJCas)
        {
            String text = aJCas.getDocumentText();
            return text != null && !text.isEmpty();
        }
    }

    /**
     * Filter that accepts only documents whose language is one of the configured languages.
     */
    public static class LanguageFilter
        extends CasFilter_ImplBase
    {
        /** Name of the mandatory parameter listing the accepted document languages. */
        public static final String PARAM_REQUIRED_LANGUAGES = "requiredLanguages";
        @ConfigurationParameter(name = PARAM_REQUIRED_LANGUAGES, mandatory = true)
        private Set<String> requiredLanguages;

        /**
         * @param aJCas
         *            the CAS to inspect.
         * @return {@code true} if the document language is contained in the required languages.
         */
        @Override
        protected boolean pass(JCas aJCas)
        {
            return requiredLanguages.contains(aJCas.getDocumentLanguage());
        }
    }
}