/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.cf.taste.impl.transforms; import java.util.Collection; import java.util.Map; import java.util.concurrent.Callable; import org.apache.mahout.cf.taste.common.Refreshable; import org.apache.mahout.cf.taste.common.TasteException; import org.apache.mahout.cf.taste.impl.common.FastByIDMap; import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; import org.apache.mahout.cf.taste.impl.common.RefreshHelper; import org.apache.mahout.cf.taste.model.DataModel; import org.apache.mahout.cf.taste.model.Preference; import org.apache.mahout.cf.taste.model.PreferenceArray; import org.apache.mahout.cf.taste.transforms.PreferenceTransform; import com.google.common.base.Preconditions; /** * <p> * Implements an "inverse user frequency" transformation, which boosts preference values for items for which * few users have expressed a preference, and reduces preference values for items for which many users have * expressed a preference. The idea is that these "rare" items are more useful in deciding how similar two * users' tastes are, and so should be emphasized in other calculatioons. This idea is mentioned in <a * href="ftp://ftp.research.microsoft.com/pub/tr/tr-98-12.pdf">Empirical Analysis of Predictive Algorithms for * Collaborative Filtering</a>. * </p> * * <p> * A scaling factor is computed for each item by dividing the total number of users by the number of users * expressing a preference for that item, and taking the log of that value. The log base of this calculation * can be controlled in the constructor. Intuitively, the right value for the base is equal to the average * number of users who express a preference for each item in your model. If each item has about 100 * preferences on average, 100.0 is a good log base. * </p> */ public final class InverseUserFrequency implements PreferenceTransform { private final DataModel dataModel; private final RefreshHelper refreshHelper; private final double logBase; private FastByIDMap<Double> iufFactors; /** * <p> * Creates a transformation. Computations use the given log base. * </p> * * @param dataModel * {@link DataModel} from which to calculate user frequencies * @param logBase * calculation logarithm base * @throws IllegalArgumentException * if dataModel is {@code null} or logBase is {@link Double#NaN} or <= 1.0 */ public InverseUserFrequency(DataModel dataModel, double logBase) throws TasteException { Preconditions.checkArgument(logBase > 1.0, "logBase should be > 1.0"); this.dataModel = Preconditions.checkNotNull(dataModel); this.logBase = logBase; this.iufFactors = new FastByIDMap<Double>(); this.refreshHelper = new RefreshHelper(new Callable<Object>() { @Override public Object call() throws TasteException { recompute(); return null; } }); this.refreshHelper.addDependency(this.dataModel); recompute(); } /** @return log base used in this object's calculations */ public double getLogBase() { return logBase; } @Override public float getTransformedValue(Preference pref) { Double factor = iufFactors.get(pref.getItemID()); if (factor != null) { return (float) (pref.getValue() * factor); } return pref.getValue(); } @Override public void refresh(Collection<Refreshable> alreadyRefreshed) { refreshHelper.refresh(alreadyRefreshed); } private void recompute() throws TasteException { Counters itemPreferenceCounts = new Counters(); int numUsers = 0; LongPrimitiveIterator it = dataModel.getUserIDs(); while (it.hasNext()) { PreferenceArray prefs = dataModel.getPreferencesFromUser(it.nextLong()); int size = prefs.length(); for (int i = 0; i < size; i++) { itemPreferenceCounts.increment(prefs.getItemID(i)); } numUsers++; } FastByIDMap<Double> newIufFactors = new FastByIDMap<Double>(itemPreferenceCounts.size()); double logFactor = Math.log(logBase); for (Map.Entry<Long,int[]> entry : itemPreferenceCounts.getEntrySet()) { newIufFactors.put(entry.getKey(), Math.log((double) numUsers / (double) entry.getValue()[0]) / logFactor); } iufFactors = newIufFactors; } @Override public String toString() { return "InverseUserFrequency[logBase:" + logBase + ']'; } }