/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.common;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.chombo.util.TextInt;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;

/**
 * Predicts the rating for a user and an item, based on other items the user has rated and the
 * correlation between those items. This is the second MR job to run, after rating correlations
 * are available.
 * @author pranab
 *
 */
public class UtilityPredictor extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "Rating predictor MR";
        job.setJobName(jobName);

        job.setJarByClass(UtilityPredictor.class);

        FileInputFormat.addInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(UtilityPredictor.PredictionMapper.class);
        job.setReducerClass(UtilityPredictor.PredictorReducer.class);

        job.setMapOutputKeyClass(TextInt.class);
        job.setMapOutputValueClass(Tuple.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setGroupingComparatorClass(ItemIdGroupComprator.class);
        job.setPartitionerClass(ItemIdPartitioner.class);

        Utility.setConfiguration(job.getConfiguration());
        int numReducer = job.getConfiguration().getInt("utp.num.reducer", -1);
        numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
        job.setNumReduceTasks(numReducer);

        int status = job.waitForCompletion(true) ? 0 : 1;
        return status;
    }
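
    /*
     * The job reads three kinds of records, told apart by input file name prefix (see
     * PredictionMapper.setup). The layouts below are inferred from the field indexes the
     * mapper uses; the sample values are hypothetical:
     *
     *   rating file ("rating" prefix)    : itemID,userID:rating:timestamp[:context],...
     *                                      e.g. it123,us456:80:1350000000
     *   rating stat file ("stat" prefix) : itemID,...  with the rating std dev in the 4th field
     *   correlation file (anything else) : itemID1,itemID2,correlation,correlationLength
     *
     * A typical invocation (jar name and paths are placeholders):
     *   hadoop jar sifarish.jar org.sifarish.common.UtilityPredictor <input dirs> <output dir>
     */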

    /**
     * @author pranab
     *
     */
    public static class PredictionMapper extends Mapper<LongWritable, Text, TextInt, Tuple> {
        private String fieldDelim;
        private String subFieldDelim;
        private boolean isRatingFileSplit;
        private TextInt keyOut = new TextInt();
        private Tuple valOut = new Tuple();
        private String[] ratings;
        private Integer two = 2;
        private Integer one = 1;
        private Integer zero = 0;
        private boolean linearCorrelation;
        private boolean isRatingStatFileSplit;
        private long ratingTimeCutoff;
        private long timeStamp;
        private int minInputRating;
        private int inputRating;
        private int minCorrelation;
        private int correlation;
        private int correlationLength;
        private boolean userRatingWithContext;
        private String ratingContext;
        private static final int STD_DEV_ORD = 3;
        private static final Logger LOG = Logger.getLogger(UtilityPredictor.PredictionMapper.class);

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            if (conf.getBoolean("debug.on", false)) {
                LOG.setLevel(Level.DEBUG);
                System.out.println("in debug mode");
            }

            fieldDelim = conf.get("field.delim", ",");
            subFieldDelim = conf.get("sub.field.delim", ":");
            String ratingFilePrefix = conf.get("utp.rating.file.prefix", "rating");
            isRatingFileSplit = ((FileSplit)context.getInputSplit()).getPath().getName().startsWith(ratingFilePrefix);
            String ratingStatFilePrefix = conf.get("utp.rating.stat.file.prefix", "stat");
            isRatingStatFileSplit = ((FileSplit)context.getInputSplit()).getPath().getName().startsWith(ratingStatFilePrefix);

            linearCorrelation = conf.getBoolean("utp.correlation.linear", true);
            int ratingTimeWindow = conf.getInt("utp.rating.time.window.hour", -1);
            ratingTimeCutoff = ratingTimeWindow > 0 ?
                System.currentTimeMillis() / 1000 - ratingTimeWindow * 60L * 60L : -1;

            minInputRating = conf.getInt("utp.min.input.rating", -1);
            minCorrelation = conf.getInt("utp.min.correlation", -1);

            userRatingWithContext = conf.getBoolean("utp.user.rating.with.context", false);
            LOG.info("isRatingFileSplit:" + isRatingFileSplit);
        }
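
        /*
         * Every map output key is (itemID, tag). The tag fixes the order in which the reducer
         * sees the value types for one item: 0 = correlation, 1 = rating stat, 2 = user rating.
         * Combined with ItemIdPartitioner and ItemIdGroupComprator below, this is the standard
         * Hadoop secondary sort pattern: correlations arrive before the ratings they join with.
         */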

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
            String[] items = value.toString().split(fieldDelim);
            String itemID = items[0];
            if (isRatingFileSplit) {
                //user rating
                context.getCounter("Record type count", "Rating").increment(1);

                boolean toInclude = true;
                for (int i = 1; i < items.length; ++i) {
                    //all user ratings for this item
                    ratings = items[i].split(subFieldDelim);

                    //time sensitive recommendation
                    toInclude = true;
                    if (ratingTimeCutoff > 0) {
                        timeStamp = Long.parseLong(ratings[2]);
                        toInclude = timeStamp > ratingTimeCutoff;
                    }

                    //contextual recommendation
                    if (userRatingWithContext) {
                        ratingContext = ratings[3];
                    }

                    //check for min input rating threshold
                    inputRating = Integer.parseInt(ratings[1]);
                    toInclude = toInclude && inputRating > minInputRating;

                    if (toInclude) {
                        //itemID
                        keyOut.set(itemID, two);

                        //userID, rating
                        valOut.initialize();
                        if (userRatingWithContext) {
                            valOut.add(ratings[0], inputRating, ratingContext, two);
                        } else {
                            valOut.add(ratings[0], inputRating, two);
                        }
                        context.write(keyOut, valOut);
                    }
                }
            } else if (isRatingStatFileSplit) {
                //rating stat
                context.getCounter("Record type count", "Rating stat").increment(1);
                int ratingStdDev = Integer.parseInt(items[STD_DEV_ORD]);
                keyOut.set(itemID, one);
                valOut.initialize();
                valOut.add(ratingStdDev, one);
                context.write(keyOut, valOut);
            } else {
                //item correlation
                context.getCounter("Record type count", "Correlation").increment(1);
                correlation = Integer.parseInt(items[2]);
                correlationLength = Integer.parseInt(items[3]);

                //if correlation is above min threshold
                if (correlation > minCorrelation) {
                    //correlation of first item
                    keyOut.set(items[0], zero);
                    valOut.initialize();
                    if (linearCorrelation) {
                        //other itemID, correlation, intersection length (weight)
                        valOut.add(items[1], correlation, correlationLength, zero);
                    } else {
                        //other itemID, correlation, intersection length (weight)
                        valOut.add(items[1], -correlation, correlationLength, zero);
                    }
                    context.write(keyOut, valOut);

                    //correlation of second item
                    keyOut.set(items[1], zero);
                    valOut.initialize();
                    if (linearCorrelation) {
                        //other itemID, correlation, intersection length (weight)
                        valOut.add(items[0], correlation, correlationLength, zero);
                    } else {
                        //other itemID, correlation, intersection length (weight)
                        valOut.add(items[0], -correlation, correlationLength, zero);
                    }
                    context.write(keyOut, valOut);
                }
            }
        }
    }

    /**
     * @author pranab
     *
     */
    public static class PredictorReducer extends Reducer<TextInt, Tuple, NullWritable, Text> {
        private String fieldDelim;
        private Text valueOut = new Text();
        private List<Tuple> ratingCorrelations = new ArrayList<Tuple>();
        private boolean linearCorrelation;
        private int correlationScale;
        private int maxRating;
        private String userID;
        private String itemID;
        private int rating;
        private int ratingCorr;
        private int weight;
        private long logCounter = 0;
        private double correlationModifier;
        private Tuple ratingStat;
        private int ratingStdDev;
        private boolean userRatingWithContext;
        private String ratingContext;
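
        /*
         * Prediction formula, as implemented in reduce() below: the correlation coefficient is
         * stored as an integer scaled by utp.correlation.linear.scale (default 1000), so for
         * the linear case predRating = rating * ratingCorr / maxRating. For the non linear case
         * the mapper has already negated the coefficient, and predRating =
         * (rating * correlationScale + ratingCorr) / maxRating. modifyCorrelation() can dampen
         * or boost the coefficient by raising it to the power utp.correlation.modifier.
         */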

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            fieldDelim = conf.get("field.delim", ",");
            linearCorrelation = conf.getBoolean("utp.correlation.linear", true);
            correlationScale = conf.getInt("utp.correlation.linear.scale", 1000);
            maxRating = conf.getInt("utp.max.rating", 100);
            correlationModifier = conf.getFloat("utp.correlation.modifier", (float)1.0);
            userRatingWithContext = conf.getBoolean("utp.user.rating.with.context", false);
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void reduce(TextInt key, Iterable<Tuple> values, Context context)
            throws IOException, InterruptedException {
            ratingCorrelations.clear();
            ++logCounter;
            ratingStat = null;
            for (Tuple value : values) {
                if (((Integer)value.get(value.getSize() - 1)) == 0) {
                    //rating correlation
                    ratingCorrelations.add(value.createClone());
                    context.getCounter("Predictor", "Rating correlation").increment(1);
                } else if (((Integer)value.get(value.getSize() - 1)) == 1) {
                    //rating stat
                    ratingStat = value.createClone();
                } else {
                    //user rating
                    if (!ratingCorrelations.isEmpty()) {
                        userID = value.getString(0);
                        rating = value.getInt(1);
                        if (userRatingWithContext) {
                            ratingContext = value.getString(2);
                        }

                        //all rating correlations
                        for (Tuple ratingCorrTup : ratingCorrelations) {
                            context.getCounter("Predictor", "User rating").increment(1);
                            itemID = ratingCorrTup.getString(0);
                            ratingCorr = ratingCorrTup.getInt(1);
                            weight = ratingCorrTup.getInt(2);
                            modifyCorrelation();
                            int predRating = linearCorrelation ?
                                (rating * ratingCorr) / maxRating :
                                (rating * correlationScale + ratingCorr) / maxRating;
                            if (predRating > 0) {
                                //userID, itemID, predicted rating, correlation length, correlation coeff, input rating std dev
                                ratingStdDev = ratingStat != null ? ratingStat.getInt(0) : -1;
                                if (userRatingWithContext) {
                                    valueOut.set(userID + fieldDelim + itemID + fieldDelim + ratingContext + fieldDelim + predRating +
                                        fieldDelim + weight + fieldDelim + ratingCorr + fieldDelim + ratingStdDev);
                                } else {
                                    valueOut.set(userID + fieldDelim + itemID + fieldDelim + predRating +
                                        fieldDelim + weight + fieldDelim + ratingCorr + fieldDelim + ratingStdDev);
                                }
                                context.write(NullWritable.get(), valueOut);
                                context.getCounter("Predictor", "Rating correlation").increment(1);
                            }
                        }
                    }
                }
            }
        }

        /**
         * Applies the configured modifier by raising the normalized correlation to a power,
         * then rescaling it back to an integer.
         */
        private void modifyCorrelation() {
            double ratingCorrDb = ((double)ratingCorr) / correlationScale;
            ratingCorrDb = Math.pow(ratingCorrDb, correlationModifier);
            ratingCorr = (int)(ratingCorrDb * correlationScale);
        }
    }

    /**
     * @author pranab
     *
     */
    public static class ItemIdPartitioner extends Partitioner<TextInt, Tuple> {
        @Override
        public int getPartition(TextInt key, Tuple value, int numPartitions) {
            //consider only base part of key
            return key.baseHashCode() % numPartitions;
        }
    }

    /**
     * @author pranab
     *
     */
    public static class ItemIdGroupComprator extends WritableComparator {
        protected ItemIdGroupComprator() {
            super(TextInt.class, true);
        }

        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            //consider only the base part of the key
            TextInt t1 = ((TextInt)w1);
            TextInt t2 = ((TextInt)w2);
            return t1.baseCompareTo(t2);
        }
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new UtilityPredictor(), args);
        System.exit(exitCode);
    }
}
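
/*
 * Illustrative end-to-end trace under the default configuration (correlationScale = 1000,
 * maxRating = 100, modifier = 1.0); all IDs and values below are hypothetical:
 *
 *   correlation input : it1,it2,850,25          -> keys (it1,0) and (it2,0)
 *   rating input      : it1,us9:80:1350000000   -> key (it1,2), value (us9, 80)
 *   reduce for it1    : predRating = 80 * 850 / 100 = 680
 *   output line       : us9,it2,680,25,850,-1   (std dev is -1: no stat record was present)
 */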