/* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package de.tudarmstadt.ukp.dkpro.core.io.imscwb; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; import static org.junit.Assert.assertEquals; import java.io.File; import org.apache.commons.io.FileUtils; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.component.CasDumpWriter; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.io.bnc.BncReader; import de.tudarmstadt.ukp.dkpro.core.io.negra.NegraExportReader; import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; import de.tudarmstadt.ukp.dkpro.core.snowball.SnowballStemmer; import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; /** * * */ public class ImsCwbWriterTest { private static final String outputFile = "target/corpus-sample.ims"; @Test public void test1() throws Exception { File dump = new File(testContext.getTestOutputFolder(), "/dump.txt"); File output = new File(testContext.getTestOutputFolder(), "/output.txt"); CollectionReader ner = createReader( NegraExportReader.class, NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/tuebadz/corpus-sample.export", NegraExportReader.PARAM_LANGUAGE, "de", NegraExportReader.PARAM_ENCODING, "UTF-8"); AnalysisEngineDescription tag = createEngineDescription( OpenNlpPosTagger.class); AnalysisEngineDescription tw = createEngineDescription( ImsCwbWriter.class, ImsCwbWriter.PARAM_TARGET_LOCATION, output, ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8"); AnalysisEngineDescription cdw = createEngineDescription( CasDumpWriter.class, CasDumpWriter.PARAM_OUTPUT_FILE, dump); runPipeline(ner, tag, tw, cdw); String reference = FileUtils.readFileToString( new File("src/test/resources/tuebadz/corpus-sample-ref.txt"), "UTF-8"); String actual = FileUtils.readFileToString( output, "UTF-8"); assertEquals(reference, actual); } @Test public void testAdditionalFeatures() throws Exception { CollectionReader ner = createReader( NegraExportReader.class, NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/tuebadz/corpus-sample.export", NegraExportReader.PARAM_LANGUAGE, "de", NegraExportReader.PARAM_ENCODING, "UTF-8"); AnalysisEngineDescription tag = createEngineDescription( OpenNlpPosTagger.class); AnalysisEngineDescription stem = createEngineDescription( SnowballStemmer.class); AnalysisEngineDescription tw = createEngineDescription( ImsCwbWriter.class, ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8", ImsCwbWriter.PARAM_WRITE_CPOS, true, ImsCwbWriter.PARAM_ADDITIONAL_FEATURES, new String[] { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem/value" }); AnalysisEngineDescription cdw = createEngineDescription( CasDumpWriter.class, CasDumpWriter.PARAM_OUTPUT_FILE, "target/dump.txt"); runPipeline(ner, tag, stem, tw, cdw); String reference = FileUtils.readFileToString( new File("src/test/resources/tuebadz/corpus-sample-addfeat-ref.txt"), "UTF-8"); String actual = FileUtils.readFileToString( new File(outputFile), "UTF-8"); assertEquals(reference, actual); } @Ignore("FX8 is a file from the BNC. While available online for download, we currently do not " + "ship it due to licensing issues.") @Test public void test1a() throws Exception { CollectionReader ner = createReader( BncReader.class, BncReader.PARAM_SOURCE_LOCATION, "src/test/resources", BncReader.PARAM_PATTERNS, new String[] { "[+]FX8.xml" }, BncReader.PARAM_LANGUAGE, "en"); AnalysisEngineDescription tw = createEngineDescription( ImsCwbWriter.class, ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8"); AnalysisEngineDescription cdw = createEngineDescription( CasDumpWriter.class, CasDumpWriter.PARAM_OUTPUT_FILE, "target/dump.txt"); runPipeline(ner, tw, cdw); String reference = FileUtils.readFileToString( new File("src/test/resources/reference/bnc-sample.ims"), "UTF-8"); String actual = FileUtils.readFileToString( new File(outputFile), "UTF-8"); assertEquals(reference, actual); } @Ignore("This test cannot work (yet) because we do not ship the cwb-encode and cwb-makeall binaries") @Test public void test2() throws Exception { CollectionReader ner = createReader( NegraExportReader.class, NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/corpus-sample.export", NegraExportReader.PARAM_LANGUAGE, "de", NegraExportReader.PARAM_ENCODING, "UTF-8"); AnalysisEngineDescription tag = createEngineDescription( OpenNlpPosTagger.class); AnalysisEngineDescription tw = createEngineDescription( ImsCwbWriter.class, ImsCwbWriter.PARAM_TARGET_LOCATION, "target/cqbformat", ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8", ImsCwbWriter.PARAM_CQP_HOME, "/Users/bluefire/bin/cwb-2.2.b99"); runPipeline(ner, tag, tw); } @Rule public DkproTestContext testContext = new DkproTestContext(); }