package org.grouplens.mooc.cbf;
import com.google.common.collect.Maps;
import it.unimi.dsi.fastutil.longs.LongSet;
import org.grouplens.lenskit.core.Transient;
import org.grouplens.lenskit.vectors.MutableSparseVector;
import org.grouplens.lenskit.vectors.SparseVector;
import org.grouplens.lenskit.vectors.VectorEntry;
import org.grouplens.mooc.cbf.dao.ItemTagDAO;
import javax.inject.Inject;
import javax.inject.Provider;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Builder for computing {@linkplain TFIDFModel TF-IDF models} from item tag data. Each item is
* represented by a normalized TF-IDF vector.
*
* @author <a href="http://www.grouplens.org">GroupLens Research</a>
*/
public class TFIDFModelBuilder implements Provider<TFIDFModel> {
    private final ItemTagDAO dao;

    /**
     * Construct a model builder. The {@link Inject} annotation on this constructor tells LensKit
     * that it can be used to build the model builder.
     *
     * @param dao The item-tag DAO. This is where the builder will get access to items and their
     *            tags.
     *            <p>{@link Transient} means that the provider promises that the DAO is no longer
     *            needed once the object is built (that is, the model will not contain a reference
     *            to the DAO). This allows LensKit to configure your recommender components
     *            properly. It's up to you to keep this promise.</p>
     */
    @Inject
    public TFIDFModelBuilder(@Transient ItemTagDAO dao) {
        this.dao = dao;
    }

    /**
     * Compute the TF-IDF model. Works in two stages: first a raw term-frequency vector is built
     * for each item while document frequencies are accumulated; then each TF vector is weighted
     * by log-IDF and normalized to a unit vector.
     *
     * @return The TF-IDF model (a map of item IDs to normalized TF-IDF tag vectors).
     */
    @Override
    public TFIDFModel get() {
        // Map each tag string to a numeric ID usable as a sparse-vector key.
        Map<String, Long> tagIds = buildTagIdMap();

        // Document frequency of every tag, accumulated while we scan the items.
        MutableSparseVector docFreq = MutableSparseVector.create(tagIds.values());
        docFreq.fill(0);

        // Stage 1 output: each item's raw term-frequency vector.
        Map<Long, MutableSparseVector> itemVectors = Maps.newHashMap();

        // Work vector re-used across items. clear() unsets every key, so that
        // shrinkDomain() later keeps only the tags actually applied to the item.
        MutableSparseVector work = MutableSparseVector.create(tagIds.values());

        LongSet items = dao.getItemIds();
        for (long item : items) {
            work.clear();
            // Tags already counted toward this item's document frequency. A HashSet
            // gives O(1) membership tests (a List.contains scan would be O(n) per tag).
            Set<String> seenTags = new HashSet<String>();
            for (String tag : dao.getItemTags(item)) {
                // Assumes every tag returned by getItemTags is in the vocabulary,
                // so the lookup never yields null.
                Long id = tagIds.get(tag);
                // get(key, dft) returns the default for a key that is in the domain
                // but unset, so no exception-based control flow is needed to
                // distinguish "first occurrence" from "increment".
                work.set(id, work.get(id, 0) + 1);
                // Count each distinct tag once per item for the DF vector.
                if (seenTags.add(tag)) {
                    docFreq.set(id, docFreq.get(id) + 1);
                }
            }
            // Save a shrunk copy of the vector (only storing tags that apply to
            // this item); IDF weighting and normalization happen in stage 2.
            itemVectors.put(item, work.shrinkDomain());
            // work is ready to be cleared and re-used for the next item.
        }

        // Turn the DF vector into a log-IDF vector in place. A tag with df == 0
        // (in the vocabulary but applied to no item) yields +Infinity here, but
        // such a tag can never appear in any item's shrunk vector, so the value
        // is never used.
        for (VectorEntry e : docFreq.fast()) {
            docFreq.set(e.getKey(), Math.log(items.size() / e.getValue()));
        }

        // Stage 2: weight each TF vector by IDF and normalize to unit length.
        Map<Long, SparseVector> modelData = Maps.newHashMap();
        for (Map.Entry<Long, MutableSparseVector> entry : itemVectors.entrySet()) {
            MutableSparseVector tv = entry.getValue();
            for (VectorEntry e : tv.fast()) {
                tv.set(e.getKey(), e.getValue() * docFreq.get(e.getKey()));
            }
            // Guard against an item with no tags: norm() would be 0 and the
            // division would fill the vector with NaN.
            double length = tv.norm();
            if (length > 0) {
                tv.multiply(1 / length);
            }
            // Store a frozen (immutable) version of the vector in the model data.
            modelData.put(entry.getKey(), tv.freeze());
        }

        // The IDF vector is no longer needed; the model only keeps the tag IDs
        // and the finished item vectors.
        return new TFIDFModel(tagIds, modelData);
    }

    /**
     * Build a mapping of tags to numeric IDs.
     *
     * @return A mapping from each tag in the DAO's vocabulary to a distinct ID
     *         (IDs start at 1 and increase in iteration order).
     */
    private Map<String, Long> buildTagIdMap() {
        // Get the universe of all tags.
        Set<String> tags = dao.getTagVocabulary();
        // Allocate our new tag map.
        Map<String, Long> tagIds = Maps.newHashMap();
        for (String tag : tags) {
            // Map each tag to a new number.
            tagIds.put(tag, tagIds.size() + 1L);
        }
        return tagIds;
    }
}