package uk.ac.shef.dcs.jate.algorithm;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBased;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Base class for term-scoring algorithms that compare word statistics of a domain corpus
 * against those of a reference (general) corpus.
 *
 * <p>Provides two static helpers: one that derives a scaling factor aligning the orders of
 * magnitude of the two corpora's normalized word frequencies, and one that derives the
 * fallback probability used for words absent from the reference corpus.
 */
public abstract class ReferenceBased extends Algorithm {

    /**
     * Probability to use for a word that is not found in the reference/general corpus.
     *
     * <p>This value is inversely related to prob(wi), where wi is a word in the domain corpus:
     * a value of 0.1 has the effect of raising prob(wi) by 10x. Consequently a term containing
     * a word that does not exist in the reference/general corpus can get an extremely high
     * score. By default it is set to the smallest word probability in the reference corpus
     * (see {@link #setNullWordProbInReference(FrequencyTermBased)}).
     */
    protected double nullWordProbInReference;

    /**
     * Flag stored by the constructor; defaults to {@code true}.
     * NOTE(review): presumably tells subclasses whether to apply
     * {@link #matchOrdersOfMagnitude(FrequencyTermBased, FrequencyTermBased)} - confirm against
     * the subclasses, it is not read in this class.
     */
    protected boolean matchOOM = true;

    /**
     * @param matchOOM value to store in {@link #matchOOM}
     */
    public ReferenceBased(boolean matchOOM) {
        this.matchOOM = matchOOM;
    }

    /**
     * Computes a power-of-ten factor equal to the difference between the order of magnitude of
     * the mean normalized word frequency in the domain feature and that in the reference
     * feature.
     *
     * <p>The order of magnitude is obtained by truncating {@code log10(mean)} towards zero
     * (an {@code int} cast), which is kept as-is so that existing scores do not change.
     *
     * @param fFeatureWords word frequency feature of the domain corpus
     * @param fFeatureRef   word frequency feature of the reference corpus
     * @return {@code 10^(oom(domain) - oom(reference))}, or {@code 1.0} when either mean is
     *         not a finite, strictly positive number (e.g. one of the features is empty)
     */
    static double matchOrdersOfMagnitude(FrequencyTermBased fFeatureWords, FrequencyTermBased fFeatureRef) {
        double meanRef = meanNormalizedFrequency(fFeatureRef);
        double mean = meanNormalizedFrequency(fFeatureWords);

        // log10 of NaN, infinity, zero or a negative number is NaN or infinite; casting those
        // to int yields 0 or Integer.MIN_VALUE/MAX_VALUE and therefore a meaningless (possibly
        // infinite) scaling factor. Fall back to the neutral factor instead.
        if (!isPositiveFinite(meanRef) || !isPositiveFinite(mean)) {
            return 1.0;
        }

        int oomRef = (int) Math.log10(meanRef);
        int oom = (int) Math.log10(mean);
        return Math.pow(10, oom - oomRef);
    }

    /**
     * Averages the normalized total term frequency over every term known to the feature.
     *
     * @return the mean, or {@code NaN} when the feature holds no terms
     */
    private static double meanNormalizedFrequency(FrequencyTermBased feature) {
        double totalScore = 0;
        double totalWords = 0;
        for (String t : feature.getMapTerm2TTF().keySet()) {
            totalWords++;
            totalScore += feature.getTTFNorm(t);
        }
        return totalScore / totalWords;
    }

    /** Tells whether {@code value} is a finite number strictly greater than zero. */
    private static boolean isPositiveFinite(double value) {
        return Double.isFinite(value) && value > 0;
    }

    /**
     * Derives the probability to assign to words missing from the reference corpus: the
     * smallest total term frequency in the reference feature divided by the corpus total.
     *
     * @param ref word frequency feature of the reference corpus
     * @return the smallest word probability in {@code ref}, or {@code 0.1} when {@code ref}
     *         holds no terms
     */
    static double setNullWordProbInReference(FrequencyTermBased ref) {
        if (ref.getMapTerm2TTF().isEmpty()) {
            return 0.1;
        }
        // Only the minimum is needed, so a linear scan replaces copying and sorting.
        int min = Collections.min(ref.getMapTerm2TTF().values());
        return (double) min / ref.getCorpusTotal();
    }
}