/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.annotator;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription;
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.testing.factory.TokenBuilder;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.AsvToolboxSplitterResource;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.FrequencyRankerResource;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.LeftToRightSplitterResource;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.RankerResource;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedDictionary;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedFinder;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedLinkingMorphemes;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedPatriciaTries;
import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SplitterResource;
import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexer;
/**
 * Tests for {@link CompoundAnnotator}. Each test builds an analysis engine description with a
 * particular splitter (and optionally a ranker) resource, runs it over a fixed German sample
 * text, and compares the covered text of the resulting {@link Compound}, {@link Split},
 * {@link CompoundPart} and {@link LinkingMorpheme} annotations against expected values.
 *
 * A Lucene index is built once before the tests run and is removed again afterwards.
 */
public class CompoundAnnotatorTest
{
    /** Input directory handed to the {@link LuceneIndexer}. */
    static File source = new File("src/test/resources/ranking/n-grams");

    /** Output directory handed to the {@link LuceneIndexer}; removed in {@link #tearDown()}. */
    static File index = new File("target/test/index");

    /** Value passed as {@code SharedFinder.PARAM_NGRAM_LOCATION}. */
    static String jWeb1TPath = "src/test/resources/web1t/de";

    /** Value passed as {@code SharedFinder.PARAM_INDEX_PATH}; same location as {@link #index}. */
    static String indexPath = "target/test/index";

    /**
     * Creates the index directory and builds the index used by the ranking tests.
     *
     * @throws IllegalStateException
     *             if the index directory does not exist and cannot be created
     * @throws Exception
     *             if indexing fails
     */
    @BeforeClass
    public static void createIndex()
        throws Exception
    {
        // mkdirs() also returns false when the directory already exists, so only treat the
        // result as a failure when the directory is really missing afterwards.
        if (!index.mkdirs() && !index.isDirectory()) {
            throw new IllegalStateException("Unable to create index directory: " + index);
        }
        LuceneIndexer indexer = new LuceneIndexer(source, index);
        indexer.index();
    }

    /**
     * Runs the annotator with a {@link LeftToRightSplitterResource} and no ranking resource.
     */
    @Test
    public void testWithoutRanking()
        throws CASException, UIMAException
    {
        AnalysisEngineDescription aed = createEngineDescription(
                CompoundAnnotator.class,
                CompoundAnnotator.PARAM_SPLITTING_ALGO,
                createExternalResourceDescription(
                        LeftToRightSplitterResource.class,
                        SplitterResource.PARAM_DICT_RESOURCE,
                        createExternalResourceDescription(SharedDictionary.class),
                        SplitterResource.PARAM_MORPHEME_RESOURCE,
                        createExternalResourceDescription(SharedLinkingMorphemes.class)));

        String[] splits = new String[] { "Aktion", "s", "plan", "Doppel", "prozessormaschine" };
        String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel",
                "prozessormaschine" };
        runAnnotator(aed, splits, compoundsParts);
    }

    /**
     * Runs the annotator with an {@link AsvToolboxSplitterResource} and a
     * {@link FrequencyRankerResource} backed by the index created in {@link #createIndex()}.
     */
    @Test
    public void testWithAsvToolbox()
        throws CASException, UIMAException
    {
        AnalysisEngineDescription aed = createEngineDescription(
                CompoundAnnotator.class,
                CompoundAnnotator.PARAM_SPLITTING_ALGO,
                createExternalResourceDescription(
                        AsvToolboxSplitterResource.class,
                        AsvToolboxSplitterResource.PARAM_DICT_RESOURCE,
                        createExternalResourceDescription(SharedDictionary.class),
                        AsvToolboxSplitterResource.PARAM_MORPHEME_RESOURCE,
                        createExternalResourceDescription(SharedLinkingMorphemes.class),
                        AsvToolboxSplitterResource.PARAM_PATRICIA_TRIES_RESOURCE,
                        createExternalResourceDescription(SharedPatriciaTries.class)),
                CompoundAnnotator.PARAM_RANKING_ALGO,
                createExternalResourceDescription(
                        FrequencyRankerResource.class,
                        RankerResource.PARAM_FINDER_RESOURCE,
                        createExternalResourceDescription(SharedFinder.class,
                                SharedFinder.PARAM_INDEX_PATH, indexPath,
                                SharedFinder.PARAM_NGRAM_LOCATION, jWeb1TPath)));

        String[] splits = new String[] { "Aktion", "s", "plan", "Doppel", "prozessormaschine",
                "prozessor", "maschine" };
        String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel",
                "prozessormaschine", "prozessor", "maschine" };
        runAnnotator(aed, splits, compoundsParts);
    }

    /**
     * Runs the annotator with a {@link LeftToRightSplitterResource} and a
     * {@link FrequencyRankerResource} backed by the index created in {@link #createIndex()}.
     */
    @Test
    public void testWithDefaults()
        throws CASException, UIMAException
    {
        AnalysisEngineDescription aed = createEngineDescription(
                CompoundAnnotator.class,
                CompoundAnnotator.PARAM_SPLITTING_ALGO,
                createExternalResourceDescription(
                        LeftToRightSplitterResource.class,
                        SplitterResource.PARAM_DICT_RESOURCE,
                        createExternalResourceDescription(SharedDictionary.class),
                        SplitterResource.PARAM_MORPHEME_RESOURCE,
                        createExternalResourceDescription(SharedLinkingMorphemes.class)),
                CompoundAnnotator.PARAM_RANKING_ALGO,
                createExternalResourceDescription(
                        FrequencyRankerResource.class,
                        RankerResource.PARAM_FINDER_RESOURCE,
                        createExternalResourceDescription(SharedFinder.class,
                                SharedFinder.PARAM_INDEX_PATH, indexPath,
                                SharedFinder.PARAM_NGRAM_LOCATION, jWeb1TPath)));

        String[] splits = new String[] { "Aktion", "s", "plan", "Doppel", "prozessormaschine",
                "prozessor", "maschine" };
        String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel",
                "prozessormaschine", "prozessor", "maschine" };
        runAnnotator(aed, splits, compoundsParts);
    }

    /**
     * Instantiates the described engine, processes the sample text
     * "Aktionsplan im Doppelprozessormaschine" and checks the produced annotations.
     *
     * @param aed
     *            description of the analysis engine under test
     * @param splits
     *            expected covered texts of all {@link Split} annotations, in index order
     * @param compoundsParts
     *            expected covered texts of all {@link CompoundPart} annotations, in index order
     * @throws CASException
     *             if the JCas view cannot be obtained
     * @throws UIMAException
     *             if engine creation or processing fails
     */
    private void runAnnotator(AnalysisEngineDescription aed, String[] splits,
            String[] compoundsParts)
        throws CASException, UIMAException
    {
        // Create Analysis Engine
        AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
        try {
            // Create cas with token
            CAS cas = ae.newCAS();
            JCas jcas = cas.getJCas();
            TokenBuilder<Token, Annotation> builder = new TokenBuilder<Token, Annotation>(
                    Token.class, Annotation.class);
            builder.buildTokens(jcas, "Aktionsplan im Doppelprozessormaschine");
            ae.typeSystemInit(cas.getTypeSystem());
            ae.process(cas);

            String[] compounds = new String[] { "Aktionsplan", "Doppelprozessormaschine" };
            String[] linkingMorphemes = new String[] { "s" };

            // Check if splits and morphemes are equal
            assertThat(getAnnotation(jcas, Compound.class), is(compounds));
            assertThat(getAnnotation(jcas, Split.class), is(splits));
            assertThat(getAnnotation(jcas, CompoundPart.class), is(compoundsParts));
            assertThat(getAnnotation(jcas, LinkingMorpheme.class), is(linkingMorphemes));
        }
        finally {
            // Release the engine even when an assertion or processing step fails
            ae.destroy();
        }
    }

    /**
     * Collects the covered text of every annotation of the given type.
     *
     * @param aCas
     *            the JCas to read annotations from
     * @param aClass
     *            the annotation type to select
     * @return the covered texts in the order returned by {@code JCasUtil.select}; empty if there
     *         are no such annotations
     */
    protected <T extends Annotation> String[] getAnnotation(JCas aCas, Class<T> aClass)
    {
        List<String> result = new ArrayList<String>();
        for (T annotation : JCasUtil.select(aCas, aClass)) {
            result.add(annotation.getCoveredText());
        }
        return result.toArray(new String[0]);
    }

    /**
     * Removes the index directory created by {@link #createIndex()} together with everything
     * inside it. Deletion is best effort: entries that cannot be deleted are skipped silently.
     */
    @AfterClass
    public static void tearDown()
        throws Exception
    {
        // Delete index again
        deleteRecursively(index);
    }

    /**
     * Deletes the given file or directory including all of its descendants.
     *
     * @param aFile
     *            the file or directory to remove
     */
    private static void deleteRecursively(File aFile)
    {
        // listFiles() yields null for plain files, missing paths and on I/O errors; in all of
        // those cases there are no children to descend into.
        File[] children = aFile.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        aFile.delete();
    }
}