TFIDFModelBuilder.java example

Explorer

coursera_recommender_systems-master
- PA1_Module_2
  - src
    - recsys
      - pb
        calculator
        calculate_ratings.java
        loader
        csv_loader.java
        main
        ratingsData.java
        recsys_code_pa1.java
        utils
        sort_map.java
        writer
        csv_writer.java
- PA2_Module_3
  - src
    - main
      - java
        org
        grouplens
        mooc
        cbf
        CBFMain.java
        TFIDFItemScorer.java
        TFIDFModel.java
        TFIDFModelBuilder.java
        dao
        CSVItemTagDAO.java
        ItemTagDAO.java
        ItemTitleDAO.java
        MOOCItemDAO.java
        MOOCRatingDAO.java
        MOOCUserDAO.java
        RatingFile.java
        TagFile.java
        TitleFile.java
        UserFile.java
        UserNameDAO.java
- PA2_Module_3-weight
  - src
    - main
      - java
        org
        grouplens
        mooc
        cbf
        CBFMain.java
        TFIDFItemScorer.java
        TFIDFModel.java
        TFIDFModelBuilder.java
        dao
        CSVItemTagDAO.java
        ItemTagDAO.java
        ItemTitleDAO.java
        MOOCItemDAO.java
        MOOCRatingDAO.java
        MOOCUserDAO.java
        RatingFile.java
        TagFile.java
        TitleFile.java
        UserFile.java
        UserNameDAO.java
- PA3_Module_4
  - src
    - main
      - java
        edu
        umn
        cs
        recsys
        dao
        ItemTitleDAO.java
        MOOCItemDAO.java
        MOOCRatingDAO.java
        MOOCUserDAO.java
        RatingFile.java
        TitleFile.java
        UserFile.java
        UserNameDAO.java
        uu
        SimpleUserUserItemScorer.java
        UUMain.java
- PA4_Module_5
  - src
    - main
      - java
        edu
        umn
        cs
        recsys
        PopularityItemScorer.java
        TagEntropyMetric.java
        TagVocabulary.java
        cbf
        LuceneItemItemModel.java
        LuceneModelBuilder.java
        dao
        CSVItemTagDAO.java
        ItemTagDAO.java
        ItemTitleDAO.java
        MOOCItemDAO.java
        MOOCRatingDAO.java
        MOOCUserDAO.java
        RatingFile.java
        TagFile.java
        TitleFile.java
        UserFile.java
        UserNameDAO.java
- PA5_Module_6
  - src
    - main
      - java
        edu
        umn
        cs
        recsys
        QueryDAOProvider.java
        dao
        CSVItemTagDAO.java
        ItemTagDAO.java
        ItemTitleDAO.java
        MOOCItemDAO.java
        MOOCRatingDAO.java
        MOOCUserDAO.java
        RatingFile.java
        TagFile.java
        TitleFile.java
        UserFile.java
        UserNameDAO.java
        ii
        IIMain.java
        SimpleGlobalItemScorer.java
        SimpleItemItemModel.java
        SimpleItemItemModelBuilder.java
        SimpleItemItemScorer.java
- PA6_Module_7
  - src
    - main
      - java
        edu
        umn
        cs
        recsys
        dao
        CSVItemTagDAO.java
        ItemTagDAO.java
        ItemTitleDAO.java
        MOOCItemDAO.java
        MOOCRatingDAO.java
        MOOCUserDAO.java
        RatingFile.java
        TagFile.java
        TitleFile.java
        UserFile.java
        UserNameDAO.java
        svd
        BaselineMode.java
        LatentFeatureCount.java
        SVDItemScorer.java
        SVDMain.java
        SVDModel.java
        SVDModelBuilder.java
- lenskit-demo
  - src
    - main
      - java
        org
        grouplens
        mooc
        demo
        App.java
    - test
      - java
        org
        grouplens
        mooc
        demo
        AppTest.java

package org.grouplens.mooc.cbf;

import com.google.common.collect.Maps;

import it.unimi.dsi.fastutil.longs.LongSet;
import org.grouplens.lenskit.core.Transient;
import org.grouplens.lenskit.vectors.MutableSparseVector;
import org.grouplens.lenskit.vectors.SparseVector;
import org.grouplens.lenskit.vectors.VectorEntry;
import org.grouplens.mooc.cbf.dao.ItemTagDAO;

import javax.inject.Inject;
import javax.inject.Provider;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Builder for computing {@linkplain TFIDFModel TF-IDF models} from item tag data.  Each item is
 * represented by a TF-IDF vector over tag IDs, scaled to unit Euclidean length whenever its
 * length is non-zero.
 *
 * @author <a href="http://www.grouplens.org">GroupLens Research</a>
 */
public class TFIDFModelBuilder implements Provider<TFIDFModel> {
    private final ItemTagDAO dao;

    /**
     * Construct a model builder.  The {@link Inject} annotation on this constructor tells LensKit
     * that it can be used to build the model builder.
     *
     * @param dao The item-tag DAO.  This is where the builder will get access to items and their
     *            tags.
     *            <p>{@link Transient} means that the provider promises that the DAO is no longer
     *            needed once the object is built (that is, the model will not contain a reference
     *            to the DAO).  This allows LensKit to configure your recommender components
     *            properly.  It's up to you to keep this promise.</p>
     */
    @Inject
    public TFIDFModelBuilder(@Transient ItemTagDAO dao) {
        this.dao = dao;
    }

    /**
     * Compute the TF-IDF model.
     *
     * <p>The computation runs in two passes.  The first pass counts, for every item, how often
     * each tag was applied to it (term frequency) and, across items, how many items carry each
     * tag (document frequency).  The second pass multiplies every term frequency by
     * {@code log(itemCount / documentFrequency)} and scales the resulting vector to unit length.</p>
     *
     * @return The TF-IDF model (a model of item tag vectors).
     * @throws IllegalStateException if an item carries a tag that is missing from the DAO's tag
     *                               vocabulary.
     */
    @Override
    public TFIDFModel get() {
        // Build a map of tags to numeric IDs.  This lets us convert tags (which are strings)
        // into long IDs that can be used as keys in a tag vector.
        Map<String, Long> tagIds = buildTagIdMap();

        // Vector accumulating document frequencies for the IDF computation; every key starts at 0.
        MutableSparseVector docFreq = MutableSparseVector.create(tagIds.values());
        docFreq.fill(0);

        // Per-item term-frequency vectors, converted to TF-IDF in the second pass.
        Map<Long, MutableSparseVector> itemVectors = Maps.newHashMap();

        // Scratch vector over the full tag domain, re-used for each item.
        MutableSparseVector work = MutableSparseVector.create(tagIds.values());

        LongSet items = dao.getItemIds();
        for (long item : items) {
            // Unset every key so the vector only reflects this item's tags.
            work.clear();

            for (String tag : dao.getItemTags(item)) {
                Long boxedId = tagIds.get(tag);
                if (boxedId == null) {
                    // Fail with context instead of an anonymous NullPointerException on unboxing.
                    throw new IllegalStateException(
                            "tag '" + tag + "' on item " + item + " is not in the tag vocabulary");
                }
                long id = boxedId;

                // get(key, default) yields the default for a key that is not set yet, so no
                // exception handling is needed to detect the first occurrence of a tag.
                double count = work.get(id, 0.0);
                if (count == 0.0) {
                    // First time this tag shows up on this item: it counts once towards the
                    // tag's document frequency, however often it is repeated on the item.
                    docFreq.set(id, docFreq.get(id) + 1);
                }
                work.set(id, count + 1);
            }

            // Save a shrunk copy of the vector (only storing tags that apply to this item);
            // IDF weighting and normalization happen in the second pass.
            itemVectors.put(item, work.shrinkDomain());
        }

        // Turn the document frequencies into log-IDF values in place.  Tags that were applied to
        // no item end up as +Infinity, but they never occur in an item vector and so are never read.
        final double itemCount = items.size();
        for (VectorEntry e : docFreq.fast()) {
            docFreq.set(e.getKey(), Math.log(itemCount / e.getValue()));
        }

        // Apply the IDF weights to every item vector and store the result in the final model data.
        Map<Long, SparseVector> modelData = Maps.newHashMap();
        for (Map.Entry<Long, MutableSparseVector> entry : itemVectors.entrySet()) {
            MutableSparseVector tv = entry.getValue();

            // Convert the term-frequency vector to a TF-IDF vector.
            for (VectorEntry e : tv.fast()) {
                tv.set(e.getKey(), e.getValue() * docFreq.get(e.getKey()));
            }

            // Normalize to a unit vector.  A vector of Euclidean length 0 (no tags, or only tags
            // whose IDF is log(1) = 0) is left untouched: scaling it by 1/0 would turn every
            // entry into NaN.
            double norm = tv.norm();
            if (norm > 0) {
                tv.multiply(1 / norm);
            }

            // Store a frozen (immutable) version of the vector in the model data.
            modelData.put(entry.getKey(), tv.freeze());
        }

        // The IDF vector is not part of the model; only the tag IDs and item vectors are kept.
        return new TFIDFModel(tagIds, modelData);
    }

    /**
     * Build a mapping of tags to numeric IDs.  IDs are assigned consecutively, starting at 1, in
     * the iteration order of the DAO's tag vocabulary.
     *
     * @return A mapping from tags to IDs.
     */
    private Map<String, Long> buildTagIdMap() {
        // Get the universe of all tags
        Set<String> tags = dao.getTagVocabulary();
        // Allocate our new tag map
        Map<String, Long> tagIds = Maps.newHashMap();

        for (String tag : tags) {
            // Map each tag to the next unused number.
            tagIds.put(tag, tagIds.size() + 1L);
        }
        return tagIds;
    }
}