/* * Sifarish: Recommendation Engine * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.sifarish.social; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.chombo.util.SecondarySort; import org.chombo.util.Tuple; import org.chombo.util.Utility; /** * Computes item pair correlation by Pearson correlation. This is an alternative to * ItemDynamicAttributeSimilarity when Pearsonn correlation is used. Input is rating matrix. * @author pranab * */ public class PearsonCorrelator extends Configured implements Tool{ @Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "PearsonCorrelator MR"; job.setJobName(jobName); job.setJarByClass(PearsonCorrelator.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(PearsonCorrelator.PearsonMapper.class); job.setReducerClass(PearsonCorrelator.PrearsonReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class); Utility.setConfiguration(job.getConfiguration()); int numReducer = job.getConfiguration().getInt("pec.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; } /** * Self join by hashing * @author pranab * */ public static class PearsonMapper extends Mapper<LongWritable, Text, Tuple, Tuple> { private int bucketCount; private int hash; private String fieldDelimRegex; private Integer hashPair; private String itemID; private Tuple keyHolder = new Tuple(); private Tuple valueHolder = new Tuple(); private int hashPairMult; private int hashCode; private int ratingScale; private String subFieldDelim; private static final Logger LOG = Logger.getLogger(PearsonCorrelator.PearsonMapper.class); /* (non-Javadoc) * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context) */ protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); if (conf.getBoolean("debug.on", false)) { LOG.setLevel(Level.DEBUG); System.out.println("in debug mode"); } fieldDelimRegex = conf.get("field.delim.regex", ","); subFieldDelim = context.getConfiguration().get("subfield.delim", ":"); bucketCount = conf.getInt("pec.bucket.count", 10); hashPairMult = conf.getInt("pec.hash.pair.multiplier", 1000); ratingScale = context.getConfiguration().getInt("pec.rating.scale", 100); } /* (non-Javadoc) * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context) */ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] items = value.toString().split(fieldDelimRegex); itemID = items[0]; hashCode = itemID.hashCode(); if (hashCode < 0) { hashCode = - hashCode; } hash = (hashCode % bucketCount) / 2 ; boolean valueInitialized = false; for (int i = 0; i < bucketCount; ++i) { keyHolder.initialize(); if (i < hash){ hashPair = hash * hashPairMult + i; keyHolder.add(hashPair, Utility.ZERO); if (!valueInitialized) { createValueTuple(Utility.ZERO, items); valueInitialized = true; } } else { if (i == hash) { valueInitialized = false; } hashPair = i * hashPairMult + hash; keyHolder.add(hashPair, Utility.ONE); if (!valueInitialized) { createValueTuple(Utility.ONE, items); valueInitialized = true; } } context.write(keyHolder, valueHolder); } } /** * @param secKey * @param items */ private void createValueTuple(Integer secKey, String[] items) { valueHolder.initialize(); valueHolder.add(secKey, items[0]); //all userID and rating pair String[] subItems = null; String userID = null; Integer rating = 0; for (int i = 1; i < items.length; ++ i) { subItems = items[i].split(subFieldDelim); userID = subItems[0]; rating = ( Integer.parseInt(subItems[1])) * ratingScale; valueHolder.add(userID, rating); } } } /** * Correlation between memebers of 2 hash buckets * @author pranab * */ public static class PrearsonReducer extends Reducer<Tuple, Tuple, NullWritable, Text> { private Text valueHolder = new Text(); private String fieldDelim; private int hashPairMult; private int corrScale; private int minRatingSetIntersection; private int corr; private int corrWeight; private List<UserRating> userRatings = new ArrayList<UserRating>(); private static final Logger LOG = Logger.getLogger(PearsonCorrelator.PrearsonReducer.class); protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); if (conf.getBoolean("debug.on", false)) { LOG.setLevel(Level.DEBUG); System.out.println("in debug mode"); } fieldDelim = conf.get("field.delim", ","); hashPairMult = conf.getInt("pec.hash.pair.multiplier", 1000); corrScale = conf.getInt("pec.correlation.scale", 1000); minRatingSetIntersection = conf.getInt("pec.min.rating.intersection.set", 3); } /* (non-Javadoc) * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context) */ protected void reduce(Tuple key, Iterable<Tuple> values, Context context) throws IOException, InterruptedException { int hashPair = key.getInt(0); UserRating userRating = null; UserRating userRatingSecond = null; if (hashPair / hashPairMult == hashPair % hashPairMult){ //same bucket userRatings.clear(); for (Tuple tuple : values) { userRating = new UserRating(tuple); userRatings.add(userRating); } //pair them for (int i = 0; i < userRatings.size(); ++i) { for (int j = i+1; j < userRatings.size(); ++ j ) { findCorrelation(userRatings.get(i), userRatings.get(j), context); if (corr > 0) { valueHolder.set(userRatings.get(i).getItemID() + fieldDelim + userRatings.get(j).getItemID() + fieldDelim + corr + fieldDelim + corrWeight); context.write(NullWritable.get(), valueHolder); } } } } else { //different bucket userRatings.clear(); for (Tuple tuple : values) { if (tuple.getInt(0) == Utility.ZERO) { userRating = new UserRating(tuple); userRatings.add(userRating); } else { userRatingSecond = new UserRating(tuple); //pair with each in the first set for (UserRating userRatingFirst : userRatings) { findCorrelation(userRatingFirst,userRatingSecond, context); if (corr > 0) { valueHolder.set(userRatingFirst.getItemID() + fieldDelim + userRatingSecond.getItemID() + fieldDelim + corr + fieldDelim + corrWeight); context.write(NullWritable.get(), valueHolder); } } } } } } /** * @param ratingOne * @param ratingTwo * @return */ private void findCorrelation(UserRating ratingOne, UserRating ratingTwo, Context context) { corr = 0; corrWeight = 0; ratingOne.initializeMatch(); ratingTwo.initializeMatch(); //find matching user rating for (int i = 0; i < ratingOne.getRatings().size(); ++i) { Pair<String, Integer> userRatingOne = ratingOne.getRatings().get(i); String userIDOne = userRatingOne.getLeft(); for(int j = 0; j < ratingTwo.getRatings().size(); ++j) { Pair<String, Integer> userRatingTwo = ratingTwo.getRatings().get(j); String userIDTwo = userRatingTwo.getLeft(); if (userIDOne.equals(userIDTwo)) { ratingOne.markMatched(i); ratingTwo.markMatched(j); break; } } } //only if rating set intersection length is greater than min if (ratingOne.getMatchCount() >= minRatingSetIntersection) { corrWeight = ratingOne.getMatchCount(); //mean and stad dev ratingOne.calculateStat(); ratingTwo.calculateStat(); LOG.debug("user match count:" + ratingOne.getMatchCount() ); LOG.debug("mean: " + ratingOne.getRatingMean() + " std dev:" + ratingOne.getRatingStdDev()); LOG.debug("mean: " + ratingTwo.getRatingMean() + " std dev:" + ratingTwo.getRatingStdDev()); //co variance int[] coVarItems = ratingOne.findCoVarianceItems(null); coVarItems = ratingTwo.findCoVarianceItems(coVarItems); int coVar = 0; for (int item : coVarItems) { coVar += item; } coVar /= coVarItems.length; if (coVar == 0) { context.getCounter("Pearson", "Zero covariance").increment(1); } //pearson correlation int stdDevProd = ratingOne.getRatingStdDev() * ratingTwo.getRatingStdDev(); if (stdDevProd == 0) { context.getCounter("Pearson", "Zero std dev").increment(1); } corr = stdDevProd == 0 ? corrScale : (coVar * corrScale) / stdDevProd; corr += corrScale; corr /= 2; } } } /** * List of users and associated ratings * @author pranab * */ public static class UserRating { private String itemID; private List<Pair<String, Integer>> ratings = new ArrayList<Pair<String, Integer>>(); private List<Integer> matchedRatings = new ArrayList<Integer>(); private int ratingMean; private int ratingStdDev; /** * @param tuple */ public UserRating(Tuple tuple) { super(); itemID = tuple.getString(1); for (int i = 2; i < tuple.getSize(); ) { String userID = tuple.getString(i++) ; Integer rating = tuple.getInt(i++); ratings.add(Pair.of(userID, rating)); } } /** * @return */ public String getItemID() { return itemID; } /** * @return */ public List<Pair<String, Integer>> getRatings() { return ratings; } /** * */ public void initializeMatch() { matchedRatings.clear(); } /** * @param index */ public void markMatched(Integer index) { matchedRatings.add(index); } /** * @return */ public int getMatchCount() { return matchedRatings.size(); } /** * @param index * @return */ public int getMatchedRating(int index) { return ratings.get(matchedRatings.get(index)).getRight(); } /** * */ public void calculateStat() { int ratingSum = 0; int ratingSquareSum = 0; int rating = 0; for (int index : matchedRatings) { rating = ratings.get(index).getRight(); ratingSum += rating; ratingSquareSum += rating * rating; } ratingMean = ratingSum / matchedRatings.size(); int var = ratingSquareSum / matchedRatings.size() - ratingMean * ratingMean; ratingStdDev = (int)Math.sqrt(var); } /** * @return */ public int getRatingMean() { return ratingMean; } /** * @return */ public int getRatingStdDev() { return ratingStdDev; } /** * @param coVarItems * @return */ public int[] findCoVarianceItems(int[] coVarItems) { int normRating = 0; if (null == coVarItems) { coVarItems = new int[matchedRatings.size()]; for (int i =0; i < matchedRatings.size(); ++i) { normRating = getMatchedRating(i) - ratingMean; coVarItems[i] =normRating; } } else { for (int i =0; i < matchedRatings.size(); ++i) { normRating = getMatchedRating(i) - ratingMean; coVarItems[i] *= normRating; } } return coVarItems; } } /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { int exitCode = ToolRunner.run(new PearsonCorrelator(), args); System.exit(exitCode); } }