ItemDynamicAttributeSimilarity.java example

Explorer

SIF-master
- sifarish-master
  - src
    - main
      - java
        org
        sifarish
        common
        AttributeBasedDiversifier.java
        BusinessGoalInjector.java
        CompactRatingFormatter.java
        CorrelationMatrixBuilder.java
        EngagementScore.java
        EngagementToPreferenceMapper.java
        GlobalNovelty.java
        ImplicitRatingEstimator.java
        IndividualNovelty.java
        ItemDynamicAttributeSimilarity.java
        ItemEngagementDistr.java
        ItemRatingAttributeAggregator.java
        NewItemUtility.java
        PositiveFeedbackBasedRankReorderer.java
        RatedItem.java
        RatedItemWithAttributes.java
        RatingBlender.java
        ResourceDescribedEntity.java
        TaggedEntity.java
        TextAnalyzer.java
        UtilityAggregator.java
        UtilityPredictor.java
        etl
        CountryStandardFormat.java
        StructuredTextAnalyzer.java
        StructuredTextNormalizer.java
        TextFieldTokenNormalizer.java
        UnitedStatesStandardFormat.java
        feature
        CharacterPairSimilarity.java
        CosineSimilarity.java
        DiceSimilarity.java
        DiffTypeSimilarity.java
        DistanceStrategy.java
        DynamicAttrSimilarityStrategy.java
        EditDistanceSimilarity.java
        EuclideanDistance.java
        JaccardSimilarity.java
        ManhattanDistance.java
        MinkwoskiDistance.java
        MixedTypeSchema.java
        NearestNeighborClassifier.java
        ProfileItemSimilarity.java
        RecordDistanceFinder.java
        SameTypeSimilarity.java
        SemanticSimilarity.java
        SingleTypeSchema.java
        TextIntInt.java
        TextIntPair.java
        TopMatches.java
        TypeSchema.java
        VotingClassifier.java
        realtime
        DitheringBolt.java
        DitheringSpout.java
        DitheringTopology.java
        RealtimeUtil.java
        RecommenderBolt.java
        RecommenderTopology.java
        RedisSpout.java
        TrendingAggregateBolt.java
        TrendingSketchesBolt.java
        TrendingSpout.java
        TrendingTopology.java
        UserItemRatings.java
        social
        ItemRatingStat.java
        PearsonCorrelator.java
        RatingDifference.java
        RatingPredictor.java
        SlopeOneRating.java
        util
        CategoricalDistance.java
        ConceptHierarchy.java
        Entity.java
        Event.java
        Field.java
        FieldExtractor.java
        FieldMapping.java
        HourWindow.java
        IDistanceStrategy.java
        Location.java
        MatchingProfile.java
        StructuredAttribute.java
        TimeWindow.java
        Utility.java

/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.common;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.chombo.util.IntPair;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.sifarish.feature.DynamicAttrSimilarityStrategy;

/**
 * Map reduce job for finding similarities between items with a dynamic set of attributes. For example,
 * products where the attributes are the users who have purchased them, or documents where the
 * attributes are the terms in the documents.
 *
 * @author pranab
 *
 */
public class ItemDynamicAttributeSimilarity  extends Configured implements Tool{
    /**
     * Configures the item similarity map reduce job and runs it to completion.
     *
     * @param args input path followed by output path
     * @return 0 when the job completes successfully, 1 otherwise
     * @throws Exception if the job can not be configured or executed
     */
    @Override
    public int run(String[] args) throws Exception   {
        Job job = new Job(getConf());
        job.setJobName("Item with dynamic attribute  similarity MR");
        job.setJarByClass(ItemDynamicAttributeSimilarity.class);

        //input and output locations
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        //mapper and reducer
        job.setMapperClass(SimilarityMapper.class);
        job.setReducerClass(SimilarityReducer.class);

        //intermediate and final output types
        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        //grouping and partitioning use only the base part of the key
        job.setGroupingComparatorClass(IdPairGroupComprator.class);
        job.setPartitionerClass(IdPairPartitioner.class);

        Configuration conf = job.getConfiguration();
        Utility.setConfiguration(conf);

        //job specific reducer count takes precedence over the generic one
        int numReducer = conf.getInt("idas.num.reducer", -1);
        if (numReducer == -1) {
            numReducer = conf.getInt("num.reducer", 1);
        }
        job.setNumReduceTasks(numReducer);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Mapper that hashes the item ID of each record into a bucket and writes the record once
     * for every bucket index. The key of each emitted record holds the partition value, a
     * number combining the record's own bucket with the bucket index, and a 0/1 tag; the value
     * is the input record prefixed with the same tag. The reducer compares records that share
     * the first two key fields.
     *
     * @author pranab
     *
     */
    public static class SimilarityMapper extends Mapper<LongWritable, Text, Tuple, Text> {
        private static final Logger LOG = Logger.getLogger(ItemDynamicAttributeSimilarity.SimilarityMapper.class);

        private int bucketCount;
        private String fieldDelimRegex;
        private int hashPairMult;
        private int partitonFieldOrdinal;
        private final Integer one = 1;
        private final Integer zero = 0;
        private final Tuple keyHolder = new Tuple();
        private final Text valueHolder = new Text();

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            if (conf.getBoolean("debug.on", false)) {
                LOG.setLevel(Level.DEBUG);
                System.out.println("in debug mode");
            }
            bucketCount = conf.getInt("idas.bucket.count", 10);
            fieldDelimRegex = conf.get("field.delim.regex", "\\[\\]");
            hashPairMult = conf.getInt("idas.hash.pair.multiplier", 1000);
            //a negative ordinal means the data is not partitioned
            partitonFieldOrdinal = conf.getInt("idas.paritioning.field.ordinal", -1);
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
            //first token is entity ID and the rest list attributes
            String record = value.toString();
            String[] items = record.split(fieldDelimRegex);
            String itemID = items[0];

            //Integer.MIN_VALUE has no positive counterpart, so plain negation would leave it
            //negative and produce a negative bucket; map it to 0 explicitly. All other hash
            //codes land in the same bucket as before.
            int hashCode = itemID.hashCode();
            int nonNegHashCode = hashCode == Integer.MIN_VALUE ? 0 : Math.abs(hashCode);
            int hash = (nonNegHashCode % bucketCount) / 2;
            String partition = partitonFieldOrdinal >= 0 ? items[partitonFieldOrdinal] : "none";

            for (int i = 0; i < bucketCount; ++i) {
                keyHolder.initialize();
                Integer hashPair;
                if (i < hash) {
                    //own bucket is the larger one of the pair, tagged 0
                    hashPair = hash * hashPairMult + i;
                    keyHolder.add(partition, hashPair, zero);
                    valueHolder.set("0" + record);
                } else {
                    //own bucket is the smaller one of the pair or same as the other, tagged 1
                    hashPair = i * hashPairMult + hash;
                    keyHolder.add(partition, hashPair, one);
                    valueHolder.set("1" + record);
                }
                context.write(keyHolder, valueHolder);
            }
        }

    }
    
    /**
     * @author pranab
     *
     */
    public static class SimilarityReducer extends Reducer<Tuple, Text, NullWritable, Text> {
        private Text valueHolder = new Text();
        private String fieldDelim;
    	private String fieldDelimRegex;
        private int delimLength;
        private int hashPairMult;
        private List<String[]> valueList = new ArrayList<String[]>();
        private DynamicAttrSimilarityStrategy simStrategy;
        private int scale;
        private boolean outputCorrelation;
        private int partitonFieldOrdinal;
        private int intLength;
        private int minIntLength;
        private boolean addMatchingContext;
        private int semanticScale;
       	private StringBuilder stBld = new StringBuilder();
        private static final Logger LOG = Logger.getLogger(ItemDynamicAttributeSimilarity.SimilarityReducer.class);
               
        
        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
        	Configuration conf = context.getConfiguration();
            if (conf.getBoolean("debug.on", false)) {
             	LOG.setLevel(Level.DEBUG);
             	System.out.println("in debug mode");
            }
        	
        	fieldDelim = conf.get("field.delim", "[]");
        	fieldDelimRegex = conf.get("field.delim.regex", "\\[\\]");
        	delimLength =  fieldDelim.length();
        	hashPairMult = conf.getInt("idas.hash.pair.multiplier", 1000);
        	String simAlgorithm = conf.get("idas.similarity.algorithm", "cosine");
        	
        	//semantic matching
        	Map<String, Object> params = new HashMap<String, Object>();
        	params.put("matcherClass", conf.get("idas.semantic.matcher.class"));
        	params.put("topMatchCount", conf.getInt("idas.semantic.top.match.count", 5));
        	params.put("semanticScale", conf.getInt("idas.semantic.match.scale", 10));
        	params.put("config", conf);
        	loadSemanticMatcherParams( conf,  params); 
        	
        	LOG.debug("simAlgorithm:" + simAlgorithm + " matcherClass: "  + conf.get("idas.semantic.matcher.class"));
        	
        	//similarity matching algorithm
        	params.put("srcNonMatchingTermWeight", conf.get("idas.jaccard.srcNonMatchingTermWeight"));
        	params.put("trgNonMatchingTermWeight", conf.get("idas.jaccard.trgNonMatchingTermWeight"));
        	simStrategy = DynamicAttrSimilarityStrategy.createSimilarityStrategy(simAlgorithm, params);
        	
        	simStrategy.setFieldDelimRegex(fieldDelimRegex);
        	boolean booleanVec = conf.getBoolean("idas.vec.type.boolean", true);
        	boolean semanticVec = conf.getBoolean("idas.vec.type.semantic", false);
        	LOG.debug("booleanVec:" + booleanVec + " semanticVec:" + semanticVec);
        	addMatchingContext = conf.getBoolean("idas.add.semantic.matching.context", false);
        	
        	//vector type
        	if (booleanVec){
        		simStrategy.setBooleanVec(booleanVec);
        	}
        	if (semanticVec){
        		simStrategy.setSemanticVec(semanticVec);
        	}
        	if (!booleanVec && !semanticVec) {
            	boolean countIncluded = conf.getBoolean("idas.vec.count.included", true);
        		simStrategy.setCountIncluded(countIncluded);
        	}
        	
           	scale = conf.getInt("idas.distance.scale", 1000);
           	outputCorrelation = conf.getBoolean("idas.output.correlation", false);
           	partitonFieldOrdinal = conf.getInt("idas.paritioning.field.ordinal", -1);
           	minIntLength =  conf.getInt("idas.min.intersection.length", 2);
           	LOG.debug("outputCorrelation:" + outputCorrelation + " partitonFieldOrdinal:" + partitonFieldOrdinal +
           			" minIntLength:" + minIntLength);
          }    
        
        /**
         * @param conf
         * @param params
         */
        private void loadSemanticMatcherParams(Configuration conf, Map<String, Object> params ) {
        	String semParams = conf.get("idas.semantic.matcher.params");
        	if (!StringUtils.isBlank(semParams)) {
	        	String[] semanticParams = semParams.split(",");
	        	for (String semanticParam :  semanticParams) {
	        		params.put(semanticParam, conf.get(semanticParam));
	        	}
        	}
        	
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void reduce(Tuple  key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
        	double dist = 0;
        	intLength = -1;
        	
        	valueList.clear();
        	int firstPart = key.getInt(1);
        	//System.out.println("hashPair: " + firstPart);
        	if (firstPart / hashPairMult == firstPart % hashPairMult){
        		//same hash bucket
    			context.getCounter("Reducer", "Same  Bucket Count").increment(1);
    			//System.out.println("**same bucket");
    			
	        	for (Text value : values){
	        		String valSt = value.toString();
        			String[] parts = splitKey(valSt.substring(1));
        			valueList.add(parts);
	        	}   
	        	
	        	for (int i = 0;  i < valueList.size();  ++i){
	        		String[] firstParts = valueList.get(i);
	        		for (int j = i+1;  j < valueList.size();  ++j) {
		        		String[] secondParts = valueList.get(j);
		        		//process 2 user vectors
	        			dist = ( 1.0 - simStrategy.findDistance(firstParts[1], secondParts[1]))  * scale;
	        			dist = dist < 0.0 ? 0.0 : dist;
		        		
    					intLength = simStrategy.getIntersectionLength();
    					if( intLength >= minIntLength || simStrategy.isSemanticVec()) {
	        				if (outputCorrelation) {
	        					dist = scale - dist;
	            				//2 items IDs followed by distance and intersection length
	        					stBld.append(firstParts[0]).append(fieldDelim).append(secondParts[0]).append(fieldDelim).
	        						append( (int)dist).append(fieldDelim).append(intLength);
	        				} else {
	            				//2 items IDs followed by distance
	    	   					stBld.append(firstParts[0]).append(fieldDelim).append(secondParts[0]).append(fieldDelim).
	    	   						append( (int)dist);
	        				}
	        				
	        				//if there any matching context data
	        				if(addMatchingContext) {
	        					appendMatchingContexts(stBld);
	        				}
	
	        				valueHolder.set(stBld.toString());
		   	    			context.getCounter("Reducer", "Emit").increment(1);
		   					context.write(NullWritable.get(), valueHolder);
		        			stBld.delete(0, stBld.length());
	    				} else {
		   	    			context.getCounter("Correlation Intersection", "Below threshold").increment(1);
	    				} //if int length
	        		}//for
	        	}//for
        	} else {
        		//different hash bucket
    			context.getCounter("Reducer", "Diff Bucket Count").increment(1);
    			//System.out.println("**diff  bucket");
	        	for (Text value : values){
	        		String valSt = value.toString();
	        		if (valSt.startsWith("0")) {
	        			String[] parts = splitKey(valSt.substring(1));
	        			valueList.add(parts);
	        		} else {
	        			String[] parts = splitKey(valSt.substring(1));
	        			
	        			//match with all items of first set
	        			for (String[] firstParts : valueList) {
	        				//process 2 entity vectors
	        				dist = (1.0 - simStrategy.findDistance(firstParts[1], parts[1])) * scale;
	        				dist = dist < 0.0 ? 0.0 : dist;
	        				LOG.debug("dist:" + dist);
	        				
        					intLength = simStrategy.getIntersectionLength();
        					if( intLength >= minIntLength || simStrategy.isSemanticVec()) {
		        				if (outputCorrelation) {
		        					dist = scale - dist;
		            				//2 items IDs followed by distance and intersection l;ength
		           					stBld.append(firstParts[0]).append(fieldDelim).append(parts[0]).append(fieldDelim).
		           						append( (int)dist).append(fieldDelim).append(intLength);
	 	        				} else {
		            				//2 items IDs followed by distance
		    	   					stBld.append(firstParts[0]).append(fieldDelim).append(parts[0]).append(fieldDelim).
	    	   						append( (int)dist);
		        				}
	
		        				//if there any matching context data
		        				if(addMatchingContext) {
		        					appendMatchingContexts(stBld);
		        				}
	
		        				valueHolder.set(stBld.toString());
			   	    			context.getCounter("Reducer", "Emit").increment(1);
			   					context.write(NullWritable.get(), valueHolder);
			        			stBld.delete(0, stBld.length());
		        			} else {
			   	    			context.getCounter("Correlation Intersection", "Below threshold").increment(1);
		        			}//if int length
	        			}//for
	        		}//if
	        	}//for
        	}//if
       	
        }
        
        /**
         * @param stBld
         */
        private void appendMatchingContexts(StringBuilder stBld) {
			String[]   matchingContexts = simStrategy.getMatchingContexts();
			if (null  !=  matchingContexts) {
				for (String matchingContext :  matchingContexts) {
					stBld.append(fieldDelim).append(matchingContext);
				}
			}
        }
        
        
        /**
         * @param val
         * @return
         */
        private String[]   splitKey(String val) {
        	String[] parts = new String[2];
        	int pos = val.indexOf(fieldDelim);
        	
        	//entity ID
        	parts[0] = val.substring(0, pos);
        	
        	//list of attributes
        	if (partitonFieldOrdinal >= 0) {
        		//partitioning field in the second
        		val = val.substring( pos + delimLength);
        		pos = val.indexOf(fieldDelim);
        		parts[1] = val.substring( pos + delimLength);
        	} else {
        		//attributes from the second field onwards
        		parts[1] = val.substring( pos + delimLength);
        	}
        	return parts;
        }
        
    }
    
    /**
     * Partitioner that assigns records to reducers using only the base part of the key.
     *
     * @author pranab
     *
     */
    public static class IdPairPartitioner extends Partitioner<Tuple, Text> {
        /**
         * @param key map output key
         * @param value map output value, not used
         * @param numPartitions number of reducers
         * @return partition number between 0 and numPartitions - 1
         */
        @Override
        public int getPartition(Tuple key, Text value, int numPartitions) {
            //consider only base part of key; the sign bit is masked off because the remainder
            //of a negative hash code is negative, which is not a legal partition number
            return (key.hashCodeBase() & Integer.MAX_VALUE) % numPartitions;
        }

    }

    
    /**
     * Grouping comparator that compares keys on their base part only.
     *
     * @author pranab
     *
     */
    public static class IdPairGroupComprator extends WritableComparator {
        protected IdPairGroupComprator() {
            super(Tuple.class, true);
        }

        /**
         * @param w1 first key, expected to be a Tuple
         * @param w2 second key, expected to be a Tuple
         * @return result of comparing the base parts of the two keys
         */
        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            //consider only the base part of the key
            return ((Tuple) w1).compareToBase((Tuple) w2);
        }
    }
  
    /**
     * Runs the job through ToolRunner and exits the JVM with the job's status code.
     *
     * @param args command line arguments, passed on to ToolRunner
     * @throws Exception if the job fails with an exception
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new ItemDynamicAttributeSimilarity(), args));
    }

}