/* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.web1t; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; import static org.junit.Assert.assertEquals; import java.io.File; import java.io.IOException; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.resource.ResourceInitializationException; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpLemmatizer; import de.tudarmstadt.ukp.dkpro.core.frequency.Web1TFileAccessProvider; import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; public class Web1TWriterTest { private final int MIN_NGRAM = 1; private final int MAX_NGRAM = 3; @Test public void web1TFormatTestWithTwoMultiSlashedTypesAsFeaturePath() throws Exception { File folder = testContext.getTestOutputFolder(); Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, new String[] { Token.class.getName() + "/pos/PosValue", Token.class.getName() + "/lemma/value" }); assertEquals(1, web1tProvider.getFrequency("TO")); // "to" assertEquals(1, web1tProvider.getFrequency("NNS")); // "sentences" assertEquals(1, web1tProvider.getFrequency("EX")); // "there" assertEquals(1, web1tProvider.getFrequency("write")); assertEquals(0, web1tProvider.getFrequency("written")); } @Test public void web1TFormatTestWithMultiSlashedTypesAsFeaturePath() throws Exception { File folder = testContext.getTestOutputFolder(); Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, new String[] { Token.class.getName() + "/lemma/value" }); assertEquals(1, web1tProvider.getFrequency("write")); assertEquals(0, web1tProvider.getFrequency("written")); assertEquals(4, web1tProvider.getFrequency("sentence")); } @Test public void web1TFormatTest_randomFrequencies() throws Exception { File folder = testContext.getTestOutputFolder(); Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, new String[] { Token.class.getName() }); assertEquals(4, web1tProvider.getFrequency(".")); assertEquals(1, web1tProvider.getFrequency(",")); assertEquals(3, web1tProvider.getFrequency("sentence")); assertEquals(1, web1tProvider.getFrequency("written")); } @Test(expected = ResourceInitializationException.class) public void web1TFormatTest_exceptionForInvalidMinFrequency1() throws Exception { writeWeb1TFormat(new String[] { Token.class.getName() }, -1); } @Test(expected = ResourceInitializationException.class) public void web1TFormatTest_exceptionForInvalidMinFrequency2() throws Exception { writeWeb1TFormat(new String[] { Token.class.getName() }, 0); } private void writeWeb1TFormat(String[] strings, int minFreq) throws UIMAException, IOException { CollectionReader reader = createReader(TextReader.class, TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/", TextReader.PARAM_PATTERNS, new String[] { "[+]**/*.txt" }); AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription tagger = createEngineDescription(OpenNlpPosTagger.class); AnalysisEngineDescription lemmatizer = createEngineDescription(ClearNlpLemmatizer.class); AnalysisEngineDescription ngramWriter = createEngineDescription(Web1TWriter.class, Web1TWriter.PARAM_TARGET_LOCATION, testContext.getTestOutputFolder(), Web1TWriter.PARAM_INPUT_TYPES, strings, Web1TWriter.PARAM_MIN_NGRAM_LENGTH, MIN_NGRAM, Web1TWriter.PARAM_MAX_NGRAM_LENGTH, MAX_NGRAM, Web1TWriter.PARAM_MIN_FREQUENCY, minFreq); SimplePipeline.runPipeline(reader, segmenter, tagger, lemmatizer, ngramWriter); } private Web1TFileAccessProvider prepareWeb1TFormatTest(File target, String[] inputTypes) throws Exception { writeWeb1TFormat(target, inputTypes); Web1TFileAccessProvider web1tProvider = new Web1TFileAccessProvider("en", target, MIN_NGRAM, MAX_NGRAM); return web1tProvider; } private void writeWeb1TFormat(File target, String[] inputPath) throws Exception { CollectionReader reader = createReader(TextReader.class, TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/", TextReader.PARAM_PATTERNS, new String[] { "[+]**/*.txt" }); AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription tagger = createEngineDescription(OpenNlpPosTagger.class); AnalysisEngineDescription lemmatizer = createEngineDescription(ClearNlpLemmatizer.class); AnalysisEngineDescription ngramWriter = createEngineDescription(Web1TWriter.class, Web1TWriter.PARAM_TARGET_LOCATION, target, Web1TWriter.PARAM_INPUT_TYPES, inputPath, Web1TWriter.PARAM_MIN_NGRAM_LENGTH, MIN_NGRAM, Web1TWriter.PARAM_MAX_NGRAM_LENGTH, MAX_NGRAM); SimplePipeline.runPipeline(reader, segmenter, tagger, lemmatizer, ngramWriter); } @Rule public DkproTestContext testContext = new DkproTestContext(); }