/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.common;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.chombo.util.SecondarySort;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.sifarish.feature.RecordDistanceFinder;
import org.sifarish.feature.SingleTypeSchema;

/**
 * Solves the cold start problem for new items. Finds predicted ratings based on items already
 * recommended for a user and the content based correlation of those items with the new items.
 * @author pranab
 *
 */
public class NewItemUtility extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "new item utility estimator MR";
        job.setJobName(jobName);

        job.setJarByClass(NewItemUtility.class);

        FileInputFormat.addInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(NewItemUtility.ItemUtilityMapper.class);
        job.setReducerClass(NewItemUtility.ItemUtilityReducer.class);

        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Tuple.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
        job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

        Utility.setConfiguration(job.getConfiguration());
        int numReducer = job.getConfiguration().getInt("niu.num.reducer", -1);
        numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
        job.setNumReduceTasks(numReducer);

        int status = job.waitForCompletion(true) ? 0 : 1;
        return status;
    }
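    /*
     * Join design: the mapper replicates new item meta data to every hash bucket, while each user's
     * existing predicted ratings are hashed to a single bucket. The pair partitioner and grouping
     * comparator on the (bucket, record type) key then let one reducer call see a partition of users
     * together with all new items, where the content based ratings are estimated.
     */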
    /**
     * @author pranab
     *
     */
    public static class ItemUtilityMapper extends Mapper<LongWritable, Text, Tuple, Tuple> {
        private String fieldDelimRegex;
        private Tuple keyOut = new Tuple();
        private Tuple valOut = new Tuple();
        private boolean isMetaDataFileSplit;
        private String userD;
        private String attrs;
        private int hashBucketCount;
        private String itemID;
        private String userID;
        private String[] items;
        private int[] attrOrdinals;

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            fieldDelimRegex = config.get("field.delim.regex", ",");

            String metaDataFilePrefix = config.get("niu.new.item.metadta.file.prefix", "new");
            isMetaDataFileSplit = ((FileSplit)context.getInputSplit()).getPath().getName().startsWith(metaDataFilePrefix);
            hashBucketCount = config.getInt("niu.hash.bucket.count", 16);
            if (null != config.get("niu.item.attr.ordinals")) {
                attrOrdinals = Utility.intArrayFromString(config.get("niu.item.attr.ordinals"));
            }
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
            items = value.toString().split(fieldDelimRegex);
            if (isMetaDataFileSplit) {
                //item meta data
                itemID = items[0];
                for (int i = 0; i < hashBucketCount; ++i) {
                    keyOut.initialize();
                    keyOut.add(i, 1);

                    valOut.initialize();
                    valOut.add(1, itemID);
                    if (null != attrOrdinals) {
                        //selected attributes
                        for (int ordinal : attrOrdinals) {
                            valOut.append(items[ordinal]);
                        }
                    } else {
                        //all attributes
                        for (int ordinal = 1; ordinal < items.length; ++ordinal) {
                            valOut.append(items[ordinal]);
                        }
                    }
                    context.write(keyOut, valOut);
                }
            } else {
                //user item rating aggregated with item meta data
                userID = items[0];
                int hash = userID.hashCode();
                int bucket = (hash < 0 ? -hash : hash) % hashBucketCount;
                keyOut.initialize();
                keyOut.add(bucket, 0);

                valOut.initialize();
                for (int i = 0; i < items.length; ++i) {
                    if (i == items.length - 1) {
                        valOut.append(Integer.parseInt(items[i]));
                    } else {
                        valOut.append(items[i]);
                    }
                }
                context.write(keyOut, valOut);
            }
        }
    }
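    /*
     * Record layouts emitted by the mapper and consumed by the reducer below: a new item meta data
     * value carries [1, itemID, attr1, attr2, ...]; a predicted rating value carries
     * [userID, itemID, ..., rating] with the rating as the last, integer valued field. The second
     * element of the map output key (0 for ratings, 1 for meta data) identifies the record type.
     */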
    /**
     * @author pranab
     *
     */
    public static class ItemUtilityReducer extends Reducer<Tuple, Tuple, NullWritable, Text> {
        private String fieldDelim;
        private Text valOut = new Text();
        private String userID;
        private String itemID;
        private int rating;
        private Map<String, List<RatedItemWithAttributes>> itemsForUsers = new HashMap<String, List<RatedItemWithAttributes>>();
        private Map<String, List<RatedItem>> newItemsForUsers = new HashMap<String, List<RatedItem>>();
        private List<RatedItemWithAttributes> newItems = new ArrayList<RatedItemWithAttributes>();
        private String[] attrs;
        private SingleTypeSchema schema;
        private int scale;
        private int distThreshold;
        private RecordDistanceFinder distFinder;
        private int[] newItemPredRatings;
        private String ratingAggrStrategy;
        private StringBuilder stBld = new StringBuilder();
        private static final Logger LOG = Logger.getLogger(NewItemUtility.ItemUtilityReducer.class);

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            if (config.getBoolean("debug.on", false)) {
                LOG.setLevel(Level.DEBUG);
                System.out.println("in debug mode");
            }
            fieldDelim = config.get("field.delim", ",");

            try {
                schema = org.sifarish.util.Utility.getSameTypeSchema(config);
            } catch (Exception e) {
                throw new IOException("failed to process schema " + e.getMessage());
            }
            scale = config.getInt("niu.distance.scale", 1000);
            distThreshold = config.getInt("niu.dist.threshold", scale);
            distFinder = new RecordDistanceFinder(config.get("field.delim.regex", ","), 0, scale, distThreshold, schema, ":");
            ratingAggrStrategy = config.get("niu.new.item.rating.aggregator.strategy", "average");
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void reduce(Tuple key, Iterable<Tuple> values, Context context)
            throws IOException, InterruptedException {
            itemsForUsers.clear();
            newItems.clear();
            newItemsForUsers.clear();
            for (Tuple value : values) {
                //record type is the second element of the key, set by the mapper
                int type = key.getInt(1);
                if (0 == type) {
                    //predicted ratings with item attributes
                    userID = value.getString(0);
                    int size = value.getSize();
                    rating = value.getInt(size - 1);
                    attrs = value.subTupleAsArray(1, size - 1);
                    List<RatedItemWithAttributes> items = itemsForUsers.get(userID);
                    if (null == items) {
                        items = new ArrayList<RatedItemWithAttributes>();
                        itemsForUsers.put(userID, items);
                    }
                    items.add(new RatedItemWithAttributes(value.getString(1), rating, attrs));
                } else {
                    //new item attributes, skipping the leading record type flag
                    int size = value.getSize();
                    attrs = value.subTupleAsArray(1, size);
                    newItems.add(new RatedItemWithAttributes(value.getString(1), 0, attrs));
                }
            }

            //all users
            for (String userID : itemsForUsers.keySet()) {
                //new items
                for (RatedItemWithAttributes newItem : newItems) {
                    List<RatedItemWithAttributes> items = itemsForUsers.get(userID);
                    newItemPredRatings = new int[items.size()];
                    int i = 0;
                    //items for a user
                    for (RatedItemWithAttributes item : items) {
                        newItemPredRatings[i++] = (scale - distFinder.findDistance(item.getAttributeArray(),
                            newItem.getAttributeArray())) * item.getRight();
                    }
                    int aggrRating = aggregateRating();
                    List<RatedItem> ratedNewItems = newItemsForUsers.get(userID);
                    if (null == ratedNewItems) {
                        ratedNewItems = new ArrayList<RatedItem>();
                        newItemsForUsers.put(userID, ratedNewItems);
                    }
                    ratedNewItems.add(new RatedItem(newItem.getLeft(), aggrRating));
                }
            }

            //output ratings
            outputRating(context);
        }
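        /*
         * Rating prediction as computed in reduce() above: for each (rated item, new item) pair the
         * content based distance is turned into a similarity (scale - distance) and weighted by the
         * existing rating. For example, with scale = 1000, two rated items at distances 200 and 400
         * with ratings 4 and 2 contribute (1000 - 200) * 4 = 3200 and (1000 - 400) * 2 = 1200; the
         * "average" strategy below then yields (3200 + 1200) / (2 * 1000) = 2 with integer division,
         * i.e. a value back on the original rating scale.
         */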
        /**
         * @return aggregated predicted rating for a new item
         */
        private int aggregateRating() {
            int aggrRating = 0;
            if (ratingAggrStrategy.equals("average")) {
                int sum = 0;
                for (int rating : newItemPredRatings) {
                    sum += rating;
                }
                aggrRating = sum / (newItemPredRatings.length * scale);
            } else if (ratingAggrStrategy.equals("max")) {
                int max = -1;
                for (int rating : newItemPredRatings) {
                    if (rating > max) {
                        max = rating;
                    }
                }
                aggrRating = max;
            } else {
                throw new IllegalArgumentException("invalid rating aggregation function");
            }
            return aggrRating;
        }

        /**
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        private void outputRating(Context context) throws IOException, InterruptedException {
            //all users
            for (String userID : itemsForUsers.keySet()) {
                List<RatedItemWithAttributes> items = itemsForUsers.get(userID);
                //existing items
                for (RatedItemWithAttributes item : items) {
                    stBld.delete(0, stBld.length());
                    stBld.append(userID).append(fieldDelim).append(item.getLeft()).append(fieldDelim).
                        append(item.getRight()).append("E");
                    valOut.set(stBld.toString());
                    context.write(NullWritable.get(), valOut);
                }

                //new items
                List<RatedItem> newItems = newItemsForUsers.get(userID);
                for (RatedItem item : newItems) {
                    stBld.delete(0, stBld.length());
                    stBld.append(userID).append(fieldDelim).append(item.getLeft()).append(fieldDelim).
                        append(item.getRight()).append("N");
                    valOut.set(stBld.toString());
                    context.write(NullWritable.get(), valOut);
                }
            }
        }
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new NewItemUtility(), args);
        System.exit(exitCode);
    }
}
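/*
 * Configuration properties read by this job (defaults in parentheses):
 *   niu.num.reducer / num.reducer             number of reduce tasks (1)
 *   field.delim.regex                         input field delimiter regex (",")
 *   field.delim                               output field delimiter (",")
 *   niu.new.item.metadta.file.prefix          file name prefix marking new item meta data splits ("new")
 *   niu.hash.bucket.count                     number of hash buckets the meta data is replicated to (16)
 *   niu.item.attr.ordinals                    optional list of item attribute ordinals to use
 *   niu.distance.scale                        distance scale (1000)
 *   niu.dist.threshold                        distance threshold (defaults to the scale)
 *   niu.new.item.rating.aggregator.strategy   "average" or "max" ("average")
 *   debug.on                                  enables debug logging (false)
 *
 * Invocation sketch, for illustration only; the jar name and paths are assumptions, not part of this code:
 *   hadoop jar sifarish.jar org.sifarish.common.NewItemUtility <comma separated input paths> <output dir>
 */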