/*
* Copyright (C) 2012 Sebastian Schelter <sebastian.schelter [at] tu-berlin.de>
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package de.tuberlin.dima.recsys.ssnmm.interactioncut;
import com.google.common.collect.Iterators;
import org.apache.mahout.cf.taste.eval.DataModelBuilder;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.model.PreferenceArray;
import org.apache.mahout.common.iterator.FixedSizeSamplingIterator;
import java.util.Arrays;
/**
 * {@link DataModelBuilder} that applies the 'interaction-cut' (selective down-sampling of power users) to the
 * training data before the {@link DataModel} is created.
 *
 * <p>Any user holding more than {@code maxPrefsPerUser} preferences is replaced by a sample of those preferences
 * drawn through a {@link FixedSizeSamplingIterator}; every other user is carried over unchanged.</p>
 */
class InteractionCutDataModelBuilder implements DataModelBuilder {

  /** Threshold above which a user's preferences are down-sampled; also the requested sample size. */
  private final int maxPrefsPerUser;

  /**
   * @param maxPrefsPerUser number of preferences a user may have before being down-sampled; also handed to the
   *                        sampling iterator as the sample size
   */
  public InteractionCutDataModelBuilder(int maxPrefsPerUser) {
    this.maxPrefsPerUser = maxPrefsPerUser;
  }

  /**
   * Builds a {@link GenericDataModel} from the given training data after applying the interaction-cut.
   *
   * @param trainingData preferences keyed by user ID; the map itself is only read here, and the preference arrays
   *                     of users at or below the limit are reused as-is (same instances) in the resulting model
   * @return a {@link GenericDataModel} over the (possibly down-sampled) preferences
   */
  @Override
  public DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData) {
    FastByIDMap<PreferenceArray> cutData = new FastByIDMap<PreferenceArray>();

    for (LongPrimitiveIterator users = trainingData.keySetIterator(); users.hasNext();) {
      long userID = users.nextLong();
      PreferenceArray userPrefs = trainingData.get(userID);
      boolean withinLimit = userPrefs.length() <= maxPrefsPerUser;
      // only users exceeding the limit are touched, everybody else keeps the original array
      cutData.put(userID, withinLimit ? userPrefs : downSample(userPrefs));
    }

    return new GenericDataModel(cutData);
  }

  /**
   * Creates a new preference array from a sample of the given user's preferences.
   *
   * @param prefs preferences of a single user
   * @return a fresh {@link GenericUserPreferenceArray} holding the preferences emitted by the sampling iterator
   */
  private PreferenceArray downSample(PreferenceArray prefs) {
    FixedSizeSamplingIterator<Preference> sampler =
        new FixedSizeSamplingIterator<Preference>(maxPrefsPerUser, prefs.iterator());
    Preference[] kept = Iterators.toArray(sampler, Preference.class);
    return new GenericUserPreferenceArray(Arrays.asList(kept));
  }
}