/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.cf.taste.hadoop.preparation;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper;
import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexMapper;
import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexReducer;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
import org.apache.mahout.cf.taste.hadoop.item.ToUserVectorsReducer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.VarIntWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;

import java.util.Map;

/**
 * Prepares the preference matrix for the distributed item-based recommender (see {@link RecommenderJob}):
 * maps item IDs to an internal integer index, aggregates each user's preferences into a vector,
 * and transposes those user vectors into the rating matrix (one vector per item).
 */
public class PreparePreferenceMatrixJob extends AbstractJob {

  public static final String NUM_USERS = "numUsers.bin";
  public static final String ITEMID_INDEX = "itemIDIndex";
  public static final String USER_VECTORS = "userVectors";
  public static final String RATING_MATRIX = "ratingMatrix";

  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new PreparePreferenceMatrixJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user; "
        + "users with more preferences will be sampled down");
    addOption("minPrefsPerUser", "mp", "ignore users with fewer preferences than this "
        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as having no preference values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
    float ratingShift = Float.parseFloat(parsedArgs.get("--ratingShift"));

    // convert items to an internal index
    Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
        ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
        VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    itemIDIndex.waitForCompletion(true);
    // convert user preferences into a vector per user
    Job toUserVectors = prepareJob(getInputPath(), getOutputPath(USER_VECTORS), TextInputFormat.class,
        ToItemPrefsMapper.class, VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
        ToUserVectorsReducer.class, VarLongWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
    toUserVectors.waitForCompletion(true);

    // we need the number of users later
    int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS).getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

    // build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
        ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
        IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    /* configure sampling regarding the uservectors */
    if (parsedArgs.containsKey("--maxPrefsPerUser")) {
      int samplingSize = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
      toItemVectors.getConfiguration().setInt(ToItemVectorsMapper.SAMPLE_SIZE, samplingSize);
    }

    toItemVectors.waitForCompletion(true);

    return 0;
  }
}
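
/*
 * Usage sketch (illustration only, not part of the original class): this job is normally
 * driven by RecommenderJob, but the main() above also lets it run stand-alone via ToolRunner.
 * The jar name and HDFS paths below are assumptions made for the example; the option names
 * come from run() and from AbstractJob's standard --input/--output options, and the input is
 * the usual comma-separated userID,itemID[,preference] text consumed by ToItemPrefsMapper.
 *
 *   hadoop jar mahout-core-job.jar \
 *     org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob \
 *     --input /path/to/preferences.csv \
 *     --output /tmp/preparePreferenceMatrix \
 *     --minPrefsPerUser 2 \
 *     --booleanData false \
 *     --ratingShift 0.0
 */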