/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf;
import java.util.Locale;
import java.util.Map.Entry;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf;
import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfModel;
import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfStore;
import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.SharedDfModel;
import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.FreqDist;
import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.TermIterator;
import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.TfidfUtils;
/**
 * This component adds {@link Tfidf} annotations consisting of a term and a tfidf weight.
 * <p>
 * The annotator is type agnostic concerning the input annotation, so you have to specify the
 * annotation type and string representation. It uses a pre-serialized {@link DfStore}, which can be
 * created using the {@link TfidfConsumer}.
 * <p>
 * For every annotation selected by the configured feature path one {@link Tfidf} annotation with
 * the same begin/end offsets is added to the CAS indexes.
 */
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf" })
public class TfidfAnnotator
    extends JCasAnnotator_ImplBase
{
    /**
     * This annotator is type agnostic, so it is mandatory to specify the type of the working
     * annotation and how to obtain the string representation with the feature path.
     */
    public static final String PARAM_FEATURE_PATH = "featurePath";
    @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true)
    protected String featurePath;

    /**
     * Provide the path to the Df-Model. The value is handed to
     * {@link TfidfUtils#getDfModel(String)} during {@link #initialize(UimaContext)}.
     * <p>
     * NOTE(review): earlier documentation stated that this path is ignored when a
     * {@link SharedDfModel} is bound to this annotator; no such binding is visible in this class -
     * confirm before relying on it.
     */
    public static final String PARAM_TFDF_PATH = "tfdfPath";
    @ConfigurationParameter(name = PARAM_TFDF_PATH, mandatory = false)
    private String tfdfPath;

    /**
     * If set to true, the whole text is handled in lower case.
     */
    public static final String PARAM_LOWERCASE = "lowercase";
    @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = false, defaultValue = "false")
    protected boolean lowercase;

    /**
     * The model for term frequency weighting.
     * <p>
     * Invoke toString() on an enum of {@link WeightingModeTf} for setup.
     * <p>
     * Default value is "NORMAL" yielding an unweighted tf.
     */
    public static final String PARAM_TF_MODE = "weightingModeTf";
    @ConfigurationParameter(name = PARAM_TF_MODE, mandatory = false, defaultValue = "NORMAL")
    private WeightingModeTf weightingModeTf;

    /**
     * The model for inverse document frequency weighting.
     * <p>
     * Invoke toString() on an enum of {@link WeightingModeIdf} for setup.
     * <p>
     * Default value is "NORMAL" yielding an unweighted idf.
     */
    public static final String PARAM_IDF_MODE = "weightingModeIdf";
    @ConfigurationParameter(name = PARAM_IDF_MODE, mandatory = false, defaultValue = "NORMAL")
    private WeightingModeIdf weightingModeIdf;

    /**
     * Available modes for term frequency.
     * <ul>
     * <li>{@code BINARY}: 1 if the term occurs at all, 0 otherwise</li>
     * <li>{@code NORMAL}: the raw term count</li>
     * <li>{@code LOG}: natural logarithm of the term count (0 for a count of 0)</li>
     * <li>{@code LOG_PLUS_ONE}: natural logarithm of the term count plus one (0 for a count of
     * 0)</li>
     * </ul>
     */
    public enum WeightingModeTf
    {
        BINARY, NORMAL, LOG, LOG_PLUS_ONE
    }

    /**
     * Available modes for inverse document frequency, where {@code n} is the document count of
     * the df model and {@code df} the document frequency of the term.
     * <ul>
     * <li>{@code BINARY}: 1 if {@code df > 0}, 0 otherwise</li>
     * <li>{@code CONSTANT_ONE}: always 1</li>
     * <li>{@code NORMAL}: {@code n / df} (0 if {@code df} is 0)</li>
     * <li>{@code LOG}: natural logarithm of {@code n / df} (0 if {@code df} is 0)</li>
     * </ul>
     */
    public enum WeightingModeIdf
    {
        BINARY, CONSTANT_ONE, NORMAL, LOG
    }

    /** Document frequency model, loaded once in {@link #initialize(UimaContext)}. */
    private DfModel dfModel;

    /**
     * Initializes the component and loads the df model from the configured path.
     *
     * @param context
     *            the UIMA context handed in by the framework.
     * @throws ResourceInitializationException
     *             if the super class fails to initialize or the df model cannot be obtained; the
     *             original exception is preserved as the cause.
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        try {
            dfModel = TfidfUtils.getDfModel(tfdfPath);
        }
        catch (Exception e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Adds one {@link Tfidf} annotation for every annotation selected by the feature path.
     * <p>
     * Terms that have a document frequency of 0 in the df model are reported with a warning and
     * still receive an annotation.
     *
     * @param jcas
     *            the CAS to annotate.
     * @throws AnalysisEngineProcessException
     *             if the feature path cannot be evaluated.
     */
    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        FreqDist<String> termFrequencies = getTermFrequencies(jcas);

        try {
            for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(jcas.getCas(),
                    featurePath)) {
                String term = entry.getValue();
                if (lowercase) {
                    // NOTE(review): presumably mirrors the casing applied by
                    // TermIterator.create(..., lowercase) so the tf lookup below matches - confirm.
                    term = term.toLowerCase();
                }

                int tf = termFrequencies.getCount(term);
                int df = dfModel.getDf(term);
                if (df == 0) {
                    getContext().getLogger().log(Level.WARNING,
                            "Term [" + term + "] not found in dfStore!");
                }

                double tfidf = getWeightedTf(tf) * getWeightedIdf(df, dfModel.getDocumentCount());

                logTfidf(term, tf, df, tfidf);

                // The new annotation covers exactly the span of the source annotation.
                Tfidf tfidfAnnotation = new Tfidf(jcas);
                tfidfAnnotation.setTerm(term);
                tfidfAnnotation.setTfidfValue(tfidf);
                tfidfAnnotation.setBegin(entry.getKey().getBegin());
                tfidfAnnotation.setEnd(entry.getKey().getEnd());
                tfidfAnnotation.addToIndexes();
            }
        }
        catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Counts all terms that {@link TermIterator} yields for the configured feature path.
     *
     * @param jcas
     *            the CAS whose terms are counted.
     * @return the frequency distribution of the terms in the given CAS.
     * @throws AnalysisEngineProcessException
     *             declared for callers and subclasses.
     */
    protected FreqDist<String> getTermFrequencies(JCas jcas)
        throws AnalysisEngineProcessException
    {
        // count all terms with the given annotation
        FreqDist<String> termFrequencies = new FreqDist<String>();
        for (String term : TermIterator.create(jcas, featurePath, lowercase)) {
            termFrequencies.count(term);
        }
        return termFrequencies;
    }

    /**
     * Calculates a weighted tf according to given settings.
     *
     * @param tf
     *            the raw term count.
     * @return the weighted term frequency; 0 for a count of 0 in every mode.
     * @throws IllegalStateException
     *             if the configured mode is not handled.
     */
    private double getWeightedTf(int tf)
    {
        switch (weightingModeTf) {
        case NORMAL:
            return tf;
        case LOG:
            return tf > 0 ? Math.log(tf) : 0D;
        case LOG_PLUS_ONE:
            // Add in double arithmetic so the sum cannot overflow the int range.
            return tf > 0 ? Math.log(tf + 1D) : 0D;
        case BINARY:
            return tf > 0 ? 1D : 0D;
        default:
            throw new IllegalStateException("Unknown tf weighting mode: " + weightingModeTf);
        }
    }

    /**
     * Calculates a weighted idf according to given settings.
     * <p>
     * A document frequency of 0 (term unknown to the df model) yields 0 in every mode except
     * {@code CONSTANT_ONE}, so no infinite or NaN weights are produced by a division by zero.
     *
     * @param df
     *            the document frequency of the term.
     * @param n
     *            the number of documents in the df model.
     * @return the weighted inverse document frequency.
     * @throws IllegalStateException
     *             if the configured mode is not handled.
     */
    private double getWeightedIdf(int df, int n)
    {
        switch (weightingModeIdf) {
        case NORMAL:
            // Guard against division by zero, consistent with the LOG and BINARY modes.
            return df > 0 ? (double) n / df : 0D;
        case LOG:
            return df > 0 ? Math.log((double) n / df) : 0D;
        case CONSTANT_ONE:
            return 1D;
        case BINARY:
            return df > 0 ? 1D : 0D;
        default:
            throw new IllegalStateException("Unknown idf weighting mode: " + weightingModeIdf);
        }
    }

    /**
     * Logs the computed values at level FINEST; the message is only built if that level is
     * enabled.
     *
     * @param term
     *            the term string.
     * @param tf
     *            the raw term count.
     * @param df
     *            the document frequency of the term.
     * @param tfidf
     *            the resulting tfidf weight.
     */
    private void logTfidf(String term, int tf, int df, double tfidf)
    {
        if (getContext().getLogger().isLoggable(Level.FINEST)) {
            getContext().getLogger().log(
                    Level.FINEST,
                    String.format(Locale.US, "\"%s\" (tf: %d, df: %d, tfidf: %.2f)", term, tf, df,
                            tfidf));
        }
    }
}