/*
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.api.frequency.provider;
import java.io.IOException;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyUtils;
/**
 * Skeleton implementation of {@link FrequencyCountProvider}.
 *
 * Subclasses supply the raw counts through {@link #getFrequencyFromProvider(String)}; this class
 * applies the optional scale-down factor to them and derives probabilities, log-probabilities
 * and log-likelihood values from the resulting frequencies.
 */
public abstract class FrequencyCountProviderBase
    implements FrequencyCountProvider
{
    /**
     * Divisor applied to every raw frequency count before it is returned. A value greater than 1
     * scales the returned counts down linearly, e.g. a value of 10 yields 1/10 of the actual
     * frequency counts. This makes it possible to estimate smaller n-gram models.
     *
     * Components using the provider could do the same scaling themselves, but then each of them
     * would have to know about the down-scaling, which is undesirable. On the other hand, doing
     * it on the caller side might be much faster: the actual frequency count only has to be
     * retrieved once and several down-scaling factors can be tried instantly. So in some
     * situations it might be more efficient to retrieve the unscaled count and scale it yourself.
     *
     * Keep in mind that the resulting n-gram model is only a rough estimate of a truly
     * down-scaled model. In particular, normalizing by the number of n-grams will not give exact
     * results, because some n-grams now have a zero count and should no longer be counted. As
     * long as one stays within the same n-gram model, the effects should be relative and can be
     * ignored. Comparing relative frequencies from a down-scaled model with those of another
     * model, however, is invalid.
     */
    private int scaleDownFactor = 1;

    /**
     * Estimates the probability of a phrase as its (scaled) frequency divided by the number of
     * n-grams reported for the phrase's length.
     *
     * A phrase with a frequency of zero is treated as if it had been seen once, so the result is
     * never zero.
     *
     * @param phrase
     *            the phrase to look up
     * @return the frequency of the phrase divided by the n-gram total for its length
     * @throws IOException
     *             if the n-gram total for the phrase's length is zero, or if one of the
     *             underlying lookups fails
     */
    @Override
    public double getProbability(String phrase)
        throws IOException
    {
        // The total is checked before the frequency is fetched, so a missing total is reported
        // without triggering a frequency lookup.
        long totalNgrams = getNrOfNgrams(FrequencyUtils.getPhraseLength(phrase));
        if (totalNgrams == 0) {
            throw new IOException("Requesting probability of a phrase for which no total phrase count information is available.");
        }

        long observed = getFrequency(phrase);

        // TODO we need real language models with backoff and smoothing
        long effective = (observed == 0) ? 1 : observed;

        return (double) effective / totalNgrams;
    }

    /**
     * Returns the natural logarithm of {@link #getProbability(String)} for the given phrase.
     *
     * @param phrase
     *            the phrase to look up
     * @return the natural logarithm of the phrase's probability
     * @throws IOException
     *             if the probability cannot be determined
     */
    @Override
    public double getLogProbability(String phrase)
        throws IOException
    {
        return Math.log(getProbability(phrase));
    }

    /**
     * Returns the frequency of the phrase as delivered by
     * {@link #getFrequencyFromProvider(String)}, divided by the current scale-down factor using
     * integer division.
     *
     * @param phrase
     *            the phrase to look up
     * @return the scaled frequency of the phrase
     * @throws IOException
     *             if the raw frequency lookup fails
     */
    @Override
    public long getFrequency(String phrase)
        throws IOException
    {
        return getFrequencyFromProvider(phrase) / getScaleDownFactor();
    }

    /**
     * Retrieves the raw, unscaled frequency of the phrase from the concrete provider.
     *
     * @param phrase
     *            the phrase to look up
     * @return the unscaled frequency of the phrase
     * @throws IOException
     *             if the lookup fails
     */
    protected abstract long getFrequencyFromProvider(String phrase)
        throws IOException;

    /**
     * Computes a log-likelihood value for a term by delegating to
     * {@link FrequencyUtils#loglikelihood}, passing the given term frequency and corpus size
     * together with this provider's (scaled) frequency of the term and its number of tokens.
     *
     * @param termFrequency
     *            the frequency of the term, passed through unchanged as the first argument
     * @param sizeOfCorpus
     *            the corpus size, passed through unchanged as the second argument
     * @param term
     *            the term whose frequency is looked up in this provider
     * @return the value returned by {@code FrequencyUtils.loglikelihood}
     * @throws IOException
     *             if one of the lookups in this provider fails
     */
    public double getLogLikelihood(int termFrequency, int sizeOfCorpus, String term)
        throws IOException
    {
        // Look up the term frequency first, then the token count, matching argument order.
        long providerFrequency = getFrequency(term);
        return FrequencyUtils.loglikelihood(termFrequency, sizeOfCorpus, providerFrequency,
                getNrOfTokens());
    }

    /**
     * @return the factor by which raw frequency counts are currently divided
     */
    public int getScaleDownFactor()
    {
        return scaleDownFactor;
    }

    /**
     * Sets the factor by which raw frequency counts are divided.
     *
     * Values that are not larger than zero are rejected: a message is written to the standard
     * error stream and the previously configured factor stays in effect.
     *
     * @param scaleDownFactor
     *            the new factor; must be larger than 0 to take effect
     */
    public void setScaleDownFactor(int scaleDownFactor)
    {
        if (scaleDownFactor <= 0) {
            System.err.println("Invalid scale down factor. It needs to be larger than 0.");
            return;
        }
        this.scaleDownFactor = scaleDownFactor;
    }

    /**
     * @return the simple class name of the concrete provider, used as its identifier
     */
    @Override
    public String getID()
        throws IllegalArgumentException
    {
        return getClass().getSimpleName();
    }
}