// GenericItemSimilarity.java example

// Explorer
// mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.cf.taste.impl.similarity;

import java.util.Collection;
import java.util.Iterator;

import com.google.common.collect.AbstractIterator;
import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.common.FastIDSet;
import org.apache.mahout.cf.taste.impl.recommender.TopItems;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
import org.apache.mahout.common.RandomUtils;

import com.google.common.base.Preconditions;

/**
 * <p>
 * A "generic" {@link ItemSimilarity} which takes a static list of precomputed item similarities and bases its
 * responses on that alone. The values may have been precomputed offline by another process, stored in a file,
 * and then read and fed into an instance of this class.
 * </p>
 * 
 * <p>
 * This is perhaps the best {@link ItemSimilarity} to use with
 * {@link org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender}, for now, since the point
 * of item-based recommenders is that they can take advantage of the fact that item similarity is relatively
 * static, can be precomputed, and then used in computation to gain a significant performance advantage.
 * </p>
 */
public final class GenericItemSimilarity implements ItemSimilarity {

  /** Shared empty result returned for items that have no indexed similar items. */
  private static final long[] NO_IDS = new long[0];

  /**
   * Similarity values, keyed first by the smaller item ID of a pair and then by the larger one, so that each
   * unordered pair is stored only once.
   */
  private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<FastByIDMap<Double>>();

  /** For each item ID, the IDs of all items for which a similarity to it has been stored. */
  private final FastByIDMap<FastIDSet> similarItemIDsIndex = new FastByIDMap<FastIDSet>();

  /**
   * <p>
   * Creates a {@link GenericItemSimilarity} from a precomputed list of {@link ItemItemSimilarity}s. Each
   * represents the similarity between two distinct items. Since similarity is assumed to be symmetric, it is
   * not necessary to specify similarity between item1 and item2, and item2 and item1. Both are the same. It
   * is also not necessary to specify a similarity between any item and itself; these are assumed to be 1.0.
   * </p>
   *
   * <p>
   * Note that specifying a similarity between two items twice is not an error, but, the later value will win.
   * </p>
   *
   * @param similarities
   *          set of {@link ItemItemSimilarity}s on which to base this instance
   */
  public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities) {
    initSimilarityMaps(similarities.iterator());
  }

  /**
   * <p>
   * Like {@link #GenericItemSimilarity(Iterable)}, but will only keep the specified number of similarities
   * from the given {@link Iterable} of similarities. It will keep those with the highest similarity -- those
   * that are therefore most important.
   * </p>
   *
   * <p>
   * Thanks to tsmorton for suggesting this and providing part of the implementation.
   * </p>
   *
   * @param similarities
   *          set of {@link ItemItemSimilarity}s on which to base this instance
   * @param maxToKeep
   *          maximum number of similarities to keep
   */
  public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities, int maxToKeep) {
    Iterable<ItemItemSimilarity> keptSimilarities =
        TopItems.getTopItemItemSimilarities(maxToKeep, similarities.iterator());
    initSimilarityMaps(keptSimilarities.iterator());
  }

  /**
   * <p>
   * Builds a list of item-item similarities given an {@link ItemSimilarity} implementation and a
   * {@link DataModel}, rather than a list of {@link ItemItemSimilarity}s.
   * </p>
   *
   * <p>
   * It's valid to build a {@link GenericItemSimilarity} this way, but perhaps missing some of the point of an
   * item-based recommender. Item-based recommenders use the assumption that item-item similarities are
   * relatively fixed, and might be known already independent of user preferences. Hence it is useful to
   * inject that information, using {@link #GenericItemSimilarity(Iterable)}.
   * </p>
   *
   * <p>
   * A {@link DataModel} without any items yields an instance with no stored similarities.
   * </p>
   *
   * @param otherSimilarity
   *          other {@link ItemSimilarity} to get similarities from
   * @param dataModel
   *          data model to get items from
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel} items
   */
  public GenericItemSimilarity(ItemSimilarity otherSimilarity, DataModel dataModel) throws TasteException {
    long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
    initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, itemIDs));
  }

  /**
   * <p>
   * Like {@link #GenericItemSimilarity(ItemSimilarity, DataModel)}, but will only keep the specified
   * number of similarities from the given {@link DataModel}. It will keep those with the highest similarity
   * -- those that are therefore most important.
   * </p>
   *
   * <p>
   * Thanks to tsmorton for suggesting this and providing part of the implementation.
   * </p>
   *
   * @param otherSimilarity
   *          other {@link ItemSimilarity} to get similarities from
   * @param dataModel
   *          data model to get items from
   * @param maxToKeep
   *          maximum number of similarities to keep
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel} items
   */
  public GenericItemSimilarity(ItemSimilarity otherSimilarity,
                               DataModel dataModel,
                               int maxToKeep) throws TasteException {
    long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
    Iterator<ItemItemSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, itemIDs);
    Iterable<ItemItemSimilarity> keptSimilarities = TopItems.getTopItemItemSimilarities(maxToKeep, it);
    initSimilarityMaps(keptSimilarities.iterator());
  }

  /**
   * Drains the given iterator into {@link #similarityMaps} and {@link #similarItemIDsIndex}. Entries that
   * relate an item to itself are skipped.
   *
   * @param similarities
   *          similarities to store
   */
  private void initSimilarityMaps(Iterator<ItemItemSimilarity> similarities) {
    while (similarities.hasNext()) {
      ItemItemSimilarity iic = similarities.next();
      long similarityItemID1 = iic.getItemID1();
      long similarityItemID2 = iic.getItemID2();
      if (similarityItemID1 == similarityItemID2) {
        // Similarity between an item and itself is already assumed to be 1.0
        continue;
      }
      // Order them -- first key should be the "smaller" one
      long itemID1 = Math.min(similarityItemID1, similarityItemID2);
      long itemID2 = Math.max(similarityItemID1, similarityItemID2);

      FastByIDMap<Double> map = similarityMaps.get(itemID1);
      if (map == null) {
        map = new FastByIDMap<Double>();
        similarityMaps.put(itemID1, map);
      }
      map.put(itemID2, iic.getValue());

      // The index is kept in both directions so either item can be used as the lookup key
      doIndex(itemID1, itemID2);
      doIndex(itemID2, itemID1);
    }
  }

  /**
   * Records {@code toItemID} in the set of items similar to {@code fromItemID}, creating that set on first
   * use.
   */
  private void doIndex(long fromItemID, long toItemID) {
    FastIDSet similarItemIDs = similarItemIDsIndex.get(fromItemID);
    if (similarItemIDs == null) {
      similarItemIDs = new FastIDSet();
      similarItemIDsIndex.put(fromItemID, similarItemIDs);
    }
    similarItemIDs.add(toItemID);
  }

  /**
   * <p>
   * Returns the similarity between two items. Note that similarity is assumed to be symmetric, that
   * {@code itemSimilarity(item1, item2) == itemSimilarity(item2, item1)}, and that
   * {@code itemSimilarity(item1,item1) == 1.0} for all items.
   * </p>
   *
   * @param itemID1
   *          first item
   * @param itemID2
   *          second item
   * @return similarity between the two, or {@link Double#NaN} if no similarity is stored for the pair
   */
  @Override
  public double itemSimilarity(long itemID1, long itemID2) {
    if (itemID1 == itemID2) {
      return 1.0;
    }
    // Pairs are stored under the smaller ID first, so look them up the same way
    long firstID = Math.min(itemID1, itemID2);
    long secondID = Math.max(itemID1, itemID2);
    FastByIDMap<Double> nextMap = similarityMaps.get(firstID);
    if (nextMap == null) {
      return Double.NaN;
    }
    Double similarity = nextMap.get(secondID);
    return similarity == null ? Double.NaN : similarity;
  }

  /**
   * Returns the similarity between one item and each of several others, by calling
   * {@link #itemSimilarity(long, long)} for each pair.
   *
   * @param itemID1
   *          first item
   * @param itemID2s
   *          items to compare the first item against
   * @return similarities, in the same order as {@code itemID2s}
   */
  @Override
  public double[] itemSimilarities(long itemID1, long[] itemID2s) {
    int length = itemID2s.length;
    double[] result = new double[length];
    for (int i = 0; i < length; i++) {
      result[i] = itemSimilarity(itemID1, itemID2s[i]);
    }
    return result;
  }

  /**
   * Returns the IDs of all items for which a similarity to the given item has been stored.
   *
   * @param itemID
   *          item to look up
   * @return IDs of similar items; an empty array if none are indexed for {@code itemID}
   */
  @Override
  public long[] allSimilarItemIDs(long itemID) {
    FastIDSet similarItemIDs = similarItemIDsIndex.get(itemID);
    return similarItemIDs != null ? similarItemIDs.toArray() : NO_IDS;
  }

  /** Does nothing; the stored similarities are never recomputed by this method. */
  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Do nothing
  }

  /** Encapsulates a similarity between two items. Similarity must be in the range [-1.0,1.0]. */
  public static final class ItemItemSimilarity implements Comparable<ItemItemSimilarity> {

    private final long itemID1;
    private final long itemID2;
    private final double value;

    /**
     * @param itemID1
     *          first item
     * @param itemID2
     *          second item
     * @param value
     *          similarity between the two
     * @throws IllegalArgumentException
     *           if value is NaN, less than -1.0 or greater than 1.0
     */
    public ItemItemSimilarity(long itemID1, long itemID2, double value) {
      // NaN fails both comparisons, so it is rejected here as well
      Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: %s", value);
      this.itemID1 = itemID1;
      this.itemID2 = itemID2;
      this.value = value;
    }

    /** @return ID of the first item */
    public long getItemID1() {
      return itemID1;
    }

    /** @return ID of the second item */
    public long getItemID2() {
      return itemID2;
    }

    /** @return similarity between the two items */
    public double getValue() {
      return value;
    }

    @Override
    public String toString() {
      return "ItemItemSimilarity[" + itemID1 + ',' + itemID2 + ':' + value + ']';
    }

    /**
     * Defines an ordering from highest similarity to lowest. Only the similarity value is compared, so this
     * ordering is not consistent with {@link #equals(Object)}: instances for different item pairs with the
     * same value compare as 0.
     */
    @Override
    public int compareTo(ItemItemSimilarity other) {
      double otherValue = other.getValue();
      return value > otherValue ? -1 : value < otherValue ? 1 : 0;
    }

    /** Two instances are equal when both item IDs match in order and the values are numerically equal. */
    @Override
    public boolean equals(Object other) {
      if (!(other instanceof ItemItemSimilarity)) {
        return false;
      }
      ItemItemSimilarity otherSimilarity = (ItemItemSimilarity) other;
      return otherSimilarity.getItemID1() == itemID1
          && otherSimilarity.getItemID2() == itemID2
          && otherSimilarity.getValue() == value;
    }

    @Override
    public int hashCode() {
      // equals() compares values with ==, under which 0.0 and -0.0 are equal; normalize the sign of zero
      // before hashing so that equal instances always produce equal hash codes.
      double hashedValue = value == 0.0 ? 0.0 : value;
      return (int) itemID1 ^ (int) itemID2 ^ RandomUtils.hashDouble(hashedValue);
    }

  }

  /**
   * Iterates over every pair {@code (itemIDs[i], itemIDs[j])} with {@code i < j}, asking the wrapped
   * {@link ItemSimilarity} for each pair's similarity and emitting an {@link ItemItemSimilarity} for every
   * pair whose similarity is not NaN.
   */
  private static final class DataModelSimilaritiesIterator extends AbstractIterator<ItemItemSimilarity> {

    private final ItemSimilarity otherSimilarity;
    private final long[] itemIDs;
    /** Index of the first item of the pair to be examined next. */
    private int i;
    /** Cached value of {@code itemIDs[i]}. */
    private long itemID1;
    /** Index of the second item of the pair to be examined next; always greater than {@code i}. */
    private int j;

    private DataModelSimilaritiesIterator(ItemSimilarity otherSimilarity, long[] itemIDs) {
      this.otherSimilarity = otherSimilarity;
      this.itemIDs = itemIDs;
      i = 0;
      // With no items there is no first ID to cache; computeNext() never reads itemID1 in that case
      // because its loop condition fails immediately.
      itemID1 = itemIDs.length == 0 ? 0L : itemIDs[0];
      j = 1;
    }

    /**
     * Advances through the remaining pairs until one with a non-NaN similarity is found.
     *
     * @return the next similarity, or the end-of-data marker once all pairs have been examined
     * @throws IllegalStateException
     *           wrapping any {@link TasteException} thrown by the underlying {@link ItemSimilarity}
     */
    @Override
    protected ItemItemSimilarity computeNext() {
      int size = itemIDs.length;
      ItemItemSimilarity result = null;
      // i stops at size - 2: the last item has no later item to be paired with
      while (result == null && i < size - 1) {
        long itemID2 = itemIDs[j];
        double similarity;
        try {
          similarity = otherSimilarity.itemSimilarity(itemID1, itemID2);
        } catch (TasteException te) {
          // computeNext() cannot throw checked exceptions, so rethrow unchecked with the cause preserved
          throw new IllegalStateException(te);
        }
        if (!Double.isNaN(similarity)) {
          result = new ItemItemSimilarity(itemID1, itemID2, similarity);
        }
        // Move to the next pair; once j runs off the end, advance i and restart j just after it
        if (++j == size) {
          itemID1 = itemIDs[++i];
          j = i + 1;
        }
      }
      return result == null ? endOfData() : result;
    }

  }

}