/* * Copyright 2013 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf; import static de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.INCLUDE_PREFIX; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import static org.apache.uima.fit.util.JCasUtil.select; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; import java.io.File; import java.util.HashMap; import java.util.Map; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.JCasIterable; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.junit.rules.TestName; import de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.TfidfAnnotator.WeightingModeIdf; import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.TfidfAnnotator.WeightingModeTf; import 
de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;

/**
 * Tests for {@link TfidfAnnotator}: builds a tf.idf model from the text documents in
 * {@code src/test/resources/consumer/} and verifies the tf.idf values annotated on the
 * tokens for several combinations of term-frequency and inverse-document-frequency
 * weighting modes.
 */
public class TfidfAnnotatorTest
{
    // assertEquals on doubles needs an epsilon
    protected static final double EPSILON = 0.000001;

    private final static String CONSUMER_TEST_DATA_PATH = "src/test/resources/consumer/";

    @Rule
    public TemporaryFolder folder = new TemporaryFolder();

    @Rule
    public TestName name = new TestName();

    // Holds the tf.idf model; rebuilt by buildModel() before every test.
    protected File model;

    /**
     * Builds the tf.idf model over all test documents and stores it in {@link #model}.
     */
    @Before
    public void buildModel()
        throws Exception
    {
        model = folder.newFile();

        // write the model
        AnalysisEngineDescription aggregate = createEngineDescription(
                createEngineDescription(BreakIteratorSegmenter.class),
                createEngineDescription(TfidfConsumer.class,
                        TfidfConsumer.PARAM_FEATURE_PATH, Token.class,
                        TfidfConsumer.PARAM_TARGET_LOCATION, model));

        SimplePipeline.runPipeline(createReader(), aggregate);
    }

    @Before
    public void printSeparator()
    {
        System.out.println("\n=== " + name.getMethodName() + " =====================");
    }

    @Test
    public void tfidfTest_normal_constantOne()
        throws Exception
    {
        // NORMAL tf = raw term count; CONSTANT_ONE idf = 1, so tf.idf equals the raw count.
        Map<String, Double> expectedDoc1 = new HashMap<String, Double>();
        expectedDoc1.put("example", 1.0);
        expectedDoc1.put("sentence", 1.0);
        expectedDoc1.put("funny", 1.0);

        Map<String, Double> expectedDoc2 = new HashMap<String, Double>();
        expectedDoc2.put("example", 2.0);
        expectedDoc2.put("sentence", 1.0);

        runAndCheck(WeightingModeTf.NORMAL, WeightingModeIdf.CONSTANT_ONE,
                expectedDoc1, expectedDoc2);
    }

    @Test
    public void tfidfTest_binary_binary()
        throws Exception
    {
        // BINARY tf and BINARY idf: every present term scores exactly 1.
        Map<String, Double> expectedDoc1 = new HashMap<String, Double>();
        expectedDoc1.put("example", 1.0);
        expectedDoc1.put("sentence", 1.0);
        expectedDoc1.put("funny", 1.0);

        Map<String, Double> expectedDoc2 = new HashMap<String, Double>();
        expectedDoc2.put("example", 1.0);
        expectedDoc2.put("sentence", 1.0);

        runAndCheck(WeightingModeTf.BINARY, WeightingModeIdf.BINARY,
                expectedDoc1, expectedDoc2);
    }

    @Test
    public void tfidfTest_normal_log()
        throws Exception
    {
        // LOG idf: terms occurring in both documents get idf = log(2/2) = 0; "funny"
        // occurs in only one of the two documents, hence idf = log(2/1) = log(2).
        Map<String, Double> expectedDoc1 = new HashMap<String, Double>();
        expectedDoc1.put("example", 0.0);
        expectedDoc1.put("sentence", 0.0);
        expectedDoc1.put("funny", Math.log(2));

        Map<String, Double> expectedDoc2 = new HashMap<String, Double>();
        expectedDoc2.put("example", 0.0);
        expectedDoc2.put("sentence", 0.0);

        runAndCheck(WeightingModeTf.NORMAL, WeightingModeIdf.LOG,
                expectedDoc1, expectedDoc2);
    }

    /**
     * Creates a reader description over all {@code *.txt} files in the test data directory.
     */
    private CollectionReaderDescription createReader()
        throws Exception
    {
        return createReaderDescription(TextReader.class,
                TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH,
                TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt");
    }

    /**
     * Runs segmenter + {@link TfidfAnnotator} (configured with the given weighting modes
     * and the model built in {@link #buildModel()}) over the test documents and checks
     * the annotated tf.idf values against the expectations.
     *
     * @param tfMode       term-frequency weighting mode
     * @param idfMode      inverse-document-frequency weighting mode
     * @param expectedDoc1 expected term -&gt; tf.idf values for {@code test1.txt}
     * @param expectedDoc2 expected term -&gt; tf.idf values for {@code test2.txt}
     */
    private void runAndCheck(WeightingModeTf tfMode, WeightingModeIdf idfMode,
            Map<String, Double> expectedDoc1, Map<String, Double> expectedDoc2)
        throws Exception
    {
        AnalysisEngineDescription segmenter =
                createEngineDescription(BreakIteratorSegmenter.class);

        AnalysisEngineDescription tfidfAnnotator = createEngineDescription(
                TfidfAnnotator.class,
                TfidfAnnotator.PARAM_FEATURE_PATH, Token.class,
                TfidfAnnotator.PARAM_TFDF_PATH, model,
                TfidfAnnotator.PARAM_TF_MODE, tfMode,
                TfidfAnnotator.PARAM_IDF_MODE, idfMode);

        for (JCas jcas : new JCasIterable(createReader(), segmenter, tfidfAnnotator)) {
            testIt(jcas, expectedDoc1, expectedDoc2);
        }
    }

    /**
     * Dispatches the given CAS to the matching expectation map based on the document
     * title; fails for any unexpected document.
     */
    private void testIt(JCas jcas, Map<String, Double> expectedDoc1,
            Map<String, Double> expectedDoc2)
    {
        String title = DocumentMetaData.get(jcas).getDocumentTitle();
        if (title.equals("test1.txt")) {
            assertTfidfValues(jcas, expectedDoc1);
        }
        else if (title.equals("test2.txt")) {
            assertTfidfValues(jcas, expectedDoc2);
        }
        else {
            fail("There should be no other documents in that directory.");
        }
    }

    /**
     * Asserts that every {@link Tfidf} annotation in the CAS carries the expected value
     * for its term and that exactly three annotations are present.
     */
    private void assertTfidfValues(JCas jcas, Map<String, Double> expected)
    {
        int i = 0;
        for (Tfidf tfidf : select(jcas, Tfidf.class)) {
            // The term itself is used as the assertion message for easier diagnosis.
            assertEquals(tfidf.getTerm(), expected.get(tfidf.getTerm()).doubleValue(),
                    tfidf.getTfidfValue(), EPSILON);
            i++;
        }
        assertEquals(3, i);
    }
}