/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection;

import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.resource.ResourceInitializationException;
import org.junit.Rule;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.stream.Stream;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link FrequencyCounter}: each test feeds a single fixed sentence through a
 * pipeline ending in a {@link FrequencyCounter} and inspects the file it writes.
 */
public class FrequencyCounterTest
{
    /** Input document shared by all tests. */
    private static final String SENTENCE =
            "This is a first test that contains a first test example";

    /** Language code set on the input document. */
    private static final String LANGUAGE = "en";

    /** Value passed as {@link FrequencyCounter#PARAM_MIN_COUNT} in the counting tests. */
    private static final int MIN_COUNT = 1;

    /** Name of the output file created in the per-test output folder. */
    private static final String TARGET_FILE_NAME = "counts.txt";

    @Rule
    public DkproTestContext testContext = new DkproTestContext();

    /**
     * Runs the counter without any sorting option and compares its output with the expected
     * file. Both files are sorted line-wise before comparison, so line order is ignored.
     */
    @Test
    public void testCount()
            throws Exception
    {
        File targetFile = new File(DkproTestContext.get().getTestOutputFolder(),
                TARGET_FILE_NAME);
        File expectedFile = new File("src/test/resources/phrasedetection/counts.txt");

        CollectionReaderDescription reader = createReader();
        AnalysisEngineDescription segmenter = createEngineDescription(
                BreakIteratorSegmenter.class);
        AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class,
                FrequencyCounter.PARAM_TARGET_LOCATION, targetFile,
                FrequencyCounter.PARAM_MIN_COUNT, MIN_COUNT);
        writer.doFullValidation();

        SimplePipeline.runPipeline(reader, segmenter, writer);

        assertTrue(targetFile.exists());
        assertArrayEquals(readSortedLines(expectedFile), readSortedLines(targetFile));
    }

    /**
     * Runs the counter with {@link FrequencyCounter#PARAM_SORT_BY_ALPHABET} enabled and
     * requires the output to match the expected file byte for byte.
     */
    @Test
    public void testCountSortedAlphabetically()
            throws Exception
    {
        File targetFile = new File(DkproTestContext.get().getTestOutputFolder(),
                TARGET_FILE_NAME);
        File expectedFile = new File(
                "src/test/resources/phrasedetection/counts_sorted_alphabetically.txt");

        CollectionReaderDescription reader = createReader();
        AnalysisEngineDescription segmenter = createEngineDescription(
                BreakIteratorSegmenter.class);
        AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class,
                FrequencyCounter.PARAM_TARGET_LOCATION, targetFile,
                FrequencyCounter.PARAM_MIN_COUNT, MIN_COUNT,
                FrequencyCounter.PARAM_SORT_BY_ALPHABET, true);

        SimplePipeline.runPipeline(reader, segmenter, writer);

        // Same existence check as the sibling tests; gives a clearer failure than the
        // NoSuchFileException that readAllBytes would raise otherwise.
        assertTrue(targetFile.exists());
        assertArrayEquals("Alphabetic sorting invalid.",
                Files.readAllBytes(expectedFile.toPath()),
                Files.readAllBytes(targetFile.toPath()));
    }

    /**
     * Runs the counter with {@link FrequencyCounter#PARAM_SORT_BY_COUNT} enabled and checks
     * that the count column is non-increasing, separately for unigram and bigram lines.
     */
    @Test
    public void testCountSortedByValue()
            throws Exception
    {
        File targetFile = new File(DkproTestContext.get().getTestOutputFolder(),
                TARGET_FILE_NAME);

        CollectionReaderDescription reader = createReader();
        AnalysisEngineDescription segmenter = createEngineDescription(
                BreakIteratorSegmenter.class);
        AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class,
                FrequencyCounter.PARAM_TARGET_LOCATION, targetFile,
                FrequencyCounter.PARAM_MIN_COUNT, MIN_COUNT,
                FrequencyCounter.PARAM_SORT_BY_COUNT, true);

        SimplePipeline.runPipeline(reader, segmenter, writer);

        assertTrue(targetFile.exists());

        /* check unigram sorting */
        assertNonIncreasing(readCounts(targetFile, false));

        /* check bigram sorting */
        assertNonIncreasing(readCounts(targetFile, true));
    }

    /**
     * Enabling both sort options at once is expected to fail with a
     * {@link ResourceInitializationException}.
     */
    @Test(expected = ResourceInitializationException.class)
    public void testSortBoth()
            throws IOException, UIMAException
    {
        CollectionReaderDescription reader = createReader();
        AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class,
                FrequencyCounter.PARAM_SORT_BY_COUNT, true,
                FrequencyCounter.PARAM_SORT_BY_ALPHABET, true);

        SimplePipeline.runPipeline(reader, writer);
    }

    /**
     * Creates the reader description that supplies {@link #SENTENCE} as the only document.
     *
     * @return a {@link StringReader} description configured with the shared sentence and
     *         language
     * @throws ResourceInitializationException
     *             if the description cannot be created
     */
    private static CollectionReaderDescription createReader()
            throws ResourceInitializationException
    {
        return createReaderDescription(StringReader.class,
                StringReader.PARAM_DOCUMENT_TEXT, SENTENCE,
                StringReader.PARAM_LANGUAGE, LANGUAGE);
    }

    /**
     * Reads all lines of a file and returns them in natural sort order.
     *
     * @param file
     *            the file to read
     * @return the sorted lines
     * @throws IOException
     *             if the file cannot be opened
     */
    private static Object[] readSortedLines(File file)
            throws IOException
    {
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(file.toPath())) {
            return lines.sorted().toArray();
        }
    }

    /**
     * Extracts the second column of every unigram or bigram line in the given file, in file
     * order. Lines equal to {@link FrequencyCounter#NGRAM_SEPARATOR_LINE} are skipped; a line
     * counts as a bigram line when its first column contains
     * {@link FrequencyCounter#BIGRAM_SEPARATOR}.
     *
     * @param file
     *            the counter output file
     * @param bigrams
     *            {@code true} to select bigram lines, {@code false} to select unigram lines
     * @return the parsed values of the second column
     * @throws IOException
     *             if the file cannot be opened
     */
    private static double[] readCounts(File file, boolean bigrams)
            throws IOException
    {
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(file.toPath())) {
            return lines
                    .filter(line -> !line.equals(FrequencyCounter.NGRAM_SEPARATOR_LINE))
                    .map(line -> line.split(FrequencyCounter.COLUMN_SEPARATOR))
                    .filter(fields -> fields[0]
                            .contains(FrequencyCounter.BIGRAM_SEPARATOR) == bigrams)
                    .map(fields -> fields[1])
                    .mapToDouble(Double::parseDouble)
                    .toArray();
        }
    }

    /**
     * Asserts that every element is greater than or equal to its successor.
     *
     * @param values
     *            the values to check; arrays with fewer than two elements always pass
     */
    private static void assertNonIncreasing(double[] values)
    {
        for (int i = 0; i < values.length - 1; i++) {
            assertTrue(values[i] >= values[i + 1]);
        }
    }
}