package ecologylab.bigsemantics.model.text;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Vector;
import ecologylab.generic.Debug;
/**
 * Stores terms ordered by tf-idf score and caches
 * normalized sets of arbitrary maximum size.
 *
 * @author rhema
 */
public class OrderedNormalizedTermVectorCache extends Debug
{
	/** Pass as {@code max} to request every stored term. */
	public static final int NO_MAX = -1;

	/** The {@code dfBonus} used by {@link #getNormalizedOrderedTerms(int)}. */
	public static final double TF_ONLY = .5;

	/** Terms with their scores, sorted by {@link TermWithScore}'s natural ordering. */
	protected Vector<TermWithScore> orderedTerms;

	/**
	 * Normalized vectors computed so far, keyed by {@code max + ":" + dfBonus}
	 * where {@code dfBonus} has already been rounded to one decimal place.
	 */
	protected HashMap<String, Vector<TermWithScore>> cachedNormalizedVectors;

	/** Creates a cache with no terms. */
	public OrderedNormalizedTermVectorCache()
	{
		orderedTerms = new Vector<TermWithScore>();
		cachedNormalizedVectors = new HashMap<String, Vector<TermWithScore>>();
	}

	/**
	 * Creates a cache holding every non-null term of the given vector, paired
	 * with the value the vector's map stores for it, and sorts the result.
	 *
	 * @param compositeTermVector source of the term-to-score map
	 */
	public OrderedNormalizedTermVectorCache(CompositeTermVector compositeTermVector)
	{
		this();
		Map<Term, Double> termScores = compositeTermVector.map();
		for (Map.Entry<Term, Double> entry : termScores.entrySet())
		{
			Term term = entry.getKey();
			// Null keys are skipped; a TermWithScore is only built for real terms.
			if (term != null)
			{
				orderedTerms.add(new TermWithScore(term, entry.getValue()));
			}
		}
		Collections.sort(orderedTerms);
	}

	/**
	 * @return the internal sorted term vector itself (not a copy), so changes
	 *         made by the caller are visible to this cache
	 */
	public Vector<TermWithScore> getOrderedTerms()
	{
		return orderedTerms;
	}

	/**
	 * Same as {@link #getNormalizedOrderedTerms(int, double)} with
	 * {@link #TF_ONLY} as the {@code dfBonus}.
	 *
	 * @param max maximum number of terms to return, or {@link #NO_MAX} for all
	 * @return the cached or newly built normalized vector
	 */
	public Vector<TermWithScore> getNormalizedOrderedTerms(int max)
	{
		return getNormalizedOrderedTerms(max, TF_ONLY);
	}

	/**
	 * Returns the first {@code max} ordered terms with each score divided by
	 * the sum of the selected scores. Results are cached per
	 * {@code (max, rounded dfBonus)} pair, and the cached instance itself is
	 * returned on every later call with the same pair.
	 *
	 * @param max     maximum number of terms to return, or {@link #NO_MAX} for
	 *                all; zero or any other negative value yields an empty vector
	 * @param dfBonus rounded to one decimal place, then passed to each new
	 *                {@link TermWithScore}
	 * @return the normalized vector, at most {@code max} elements long
	 */
	public Vector<TermWithScore> getNormalizedOrderedTerms(int max, double dfBonus)
	{
		double roundedBonus = roundToOneDecimal(dfBonus);
		// Double.toString keeps every digit of the rounded value, so distinct
		// bonuses such as 12.3 and 12.4 can no longer collide on one key.
		String key = max + ":" + roundedBonus;

		Vector<TermWithScore> cached = cachedNormalizedVectors.get(key);
		if (cached != null)
		{
			return cached;
		}

		int available = orderedTerms.size();
		int limit = (max == NO_MAX) ? available : Math.max(0, Math.min(max, available));

		// Normalization is over the selected terms only. The original author
		// noted low confidence in this scheme; the scores are deliberately not
		// multiplied together, to keep sparse terms, because a dot product is
		// too harsh in this context.
		double totalScore = 0;
		for (int i = 0; i < limit; i++)
		{
			totalScore += orderedTerms.get(i).getScore();
		}

		Vector<TermWithScore> normalized = new Vector<TermWithScore>(limit);
		for (int i = 0; i < limit; i++)
		{
			TermWithScore term = orderedTerms.get(i);
			// A zero total would otherwise produce NaN or infinite scores.
			double share = (totalScore == 0) ? 0 : term.getScore() / totalScore;
			normalized.add(new TermWithScore(term, share, roundedBonus));
		}

		cachedNormalizedVectors.put(key, normalized);
		return normalized;
	}

	/**
	 * Rounds to one decimal place using the {@code "#.#"} pattern. The symbols
	 * are pinned to {@link Locale#ROOT} so the formatted text always uses a
	 * {@code '.'} separator and can be parsed back regardless of the JVM's
	 * default locale.
	 */
	private static double roundToOneDecimal(double value)
	{
		DecimalFormat oneDecimal =
				new DecimalFormat("#.#", DecimalFormatSymbols.getInstance(Locale.ROOT));
		return Double.parseDouble(oneDecimal.format(value));
	}
}