/*
 * Copyright 2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation;

import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader;
import static org.junit.Assert.assertEquals;

import java.io.File;
import static org.apache.commons.io.FileUtils.*;

import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.junit.Rule;
import org.junit.Test;

import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
import de.tudarmstadt.ukp.dkpro.core.io.text.TokenizedTextWriter;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;

/**
 * Tests for {@link HyphenationRemover}, configured with the German dictionary found under
 * {@code src/test/resources/dictionary/ngerman}.
 */
public class HyphenationRemoverTest
{
    /** Dictionary file handed to the remover through {@code PARAM_MODEL_LOCATION}. */
    private static final String RESOURCE_GERMAN_DICTIONARY = "src/test/resources/dictionary/ngerman";

    /** Test context rule; it supplies the output folder used by the pipeline test. */
    @Rule
    public DkproTestContext testContext = new DkproTestContext();

    /**
     * Applies the remover to a single hyphenated German sentence and checks the transformed text
     * with {@code assertTransformedText}.
     *
     * @throws Exception
     *             if creating the engine description or the assertion helper fails.
     */
    @Test
    public void testHyphenationRemover()
        throws Exception
    {
        AnalysisEngineDescription remover = createEngineDescription(HyphenationRemover.class,
                HyphenationRemover.PARAM_MODEL_LOCATION, RESOURCE_GERMAN_DICTIONARY);

        // Arguments: expected (normalized) text, input text, document language, engine.
        assertTransformedText("Ich habe einen super-tollen Bären.",
                "Ich habe ein- en super-tollen Bär-\nen.", "de", remover);
    }

    /**
     * Runs reader, remover, segmenter and writer as one pipeline over
     * {@code src/test/resources/texts/test3.txt} and compares the written file to the expected
     * tokenized text.
     *
     * @throws Exception
     *             if a component cannot be created, the pipeline fails, or the output file cannot
     *             be read.
     */
    @Test
    public void testHyphenationRemoverInPipelineReaderWriter()
        throws Exception
    {
        // The writer target and the file read back for the assertion are the same file.
        File resultFile = new File(testContext.getTestOutputFolder(), "test3.txt");

        String expectedOutput = "Ich habe einen super-tollen Bären .\n"
                + "Für eine Registrierung einer Organisation und eine EMail Adresse .\n";

        CollectionReader textReader = createReader(TextReader.class,
                TextReader.PARAM_LANGUAGE, "de",
                TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts/test3.txt");

        AnalysisEngineDescription remover = createEngineDescription(HyphenationRemover.class,
                HyphenationRemover.PARAM_MODEL_LOCATION, RESOURCE_GERMAN_DICTIONARY);

        AnalysisEngineDescription tokenizer = createEngineDescription(OpenNlpSegmenter.class,
                OpenNlpSegmenter.PARAM_VARIANT, "maxent");

        AnalysisEngineDescription textWriter = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, resultFile,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);

        // The remover runs before the segmenter in this pipeline.
        SimplePipeline.runPipeline(textReader, remover, tokenizer, textWriter);

        String actualOutput = readFileToString(resultFile, "UTF-8");
        assertEquals(expectedOutput, actualOutput);
    }
}