/*
* Sifarish: Recommendation Engine
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.sifarish.common;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.chombo.util.IntPair;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.sifarish.feature.DynamicAttrSimilarityStrategy;
/**
* MapReduce job for finding similarities between items with a dynamic set of attributes. For example,
* products where the attributes are the users who have purchased them, or documents where the
* attributes are the terms in the documents.
*
* @author pranab
*
*/
public class ItemDynamicAttributeSimilarity extends Configured implements Tool{
/**
 * Configures and launches the similarity map reduce job.
 *
 * @param args args[0] is the input path and args[1] is the output path
 * @return 0 when the job completes successfully, 1 otherwise
 * @throws Exception when the job can not be set up or executed
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJobName("Item with dynamic attribute similarity MR");
    job.setJarByClass(ItemDynamicAttributeSimilarity.class);

    //input and output locations
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    //map and reduce stages along with their key and value types
    job.setMapperClass(ItemDynamicAttributeSimilarity.SimilarityMapper.class);
    job.setReducerClass(ItemDynamicAttributeSimilarity.SimilarityReducer.class);
    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    //group and partition on the base part of the key only
    job.setGroupingComparatorClass(IdPairGroupComprator.class);
    job.setPartitionerClass(IdPairPartitioner.class);

    Configuration jobConf = job.getConfiguration();
    Utility.setConfiguration(jobConf);

    //job specific reducer count takes precedence over the generic one
    int numReducer = jobConf.getInt("idas.num.reducer", -1);
    if (numReducer == -1) {
        numReducer = jobConf.getInt("num.reducer", 1);
    }
    job.setNumReduceTasks(numReducer);

    return job.waitForCompletion(true) ? 0 : 1;
}
/**
 * Mapper that replicates every input record once per bucket index so that records from
 * any two hash buckets end up under a common key. The emitted key is
 * (partition, hash pair, tag) and the emitted value is the tag character ("0" or "1")
 * followed by the unmodified input record.
 *
 * @author pranab
 *
 */
public static class SimilarityMapper extends Mapper<LongWritable, Text, Tuple, Text> {
    private static final Logger LOG = Logger.getLogger(ItemDynamicAttributeSimilarity.SimilarityMapper.class);

    private int bucketCount;
    private String fieldDelimRegex;
    private int hashPairMult;
    private int partitonFieldOrdinal;
    private Integer one = 1;
    private Integer zero = 0;
    private Tuple keyHolder = new Tuple();
    private Text valueHolder = new Text();

    /**
     * Reads bucket count, field delimiter, hash pair multiplier and the optional
     * partitioning field ordinal from the job configuration.
     *
     * @param context mapper context supplying the configuration
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        if (conf.getBoolean("debug.on", false)) {
            LOG.setLevel(Level.DEBUG);
            System.out.println("in debug mode");
        }
        bucketCount = conf.getInt("idas.bucket.count", 10);
        fieldDelimRegex = conf.get("field.delim.regex", "\\[\\]");
        hashPairMult = conf.getInt("idas.hash.pair.multiplier", 1000);
        //NOTE: the key spelling below is what the job reads at run time; keep it as is
        partitonFieldOrdinal = conf.getInt("idas.paritioning.field.ordinal", -1);
    }

    /**
     * Emits the record once for every bucket index. Indexes below the record's own
     * bucket produce a "0" tagged copy, all others (including its own bucket) a "1"
     * tagged copy.
     *
     * @param key byte offset of the record, not used
     * @param value record whose first token is the entity ID followed by its attributes
     * @param context mapper context used for output
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String record = value.toString();

        //first token is entity ID and the rest list attributes
        String[] items = record.split(fieldDelimRegex);
        String itemID = items[0];

        //non negative hash; Integer.MIN_VALUE can not be negated, so it is mapped to 0
        //instead of leaking a negative bucket into the hash pair
        int hashCode = itemID.hashCode();
        if (hashCode == Integer.MIN_VALUE) {
            hashCode = 0;
        } else if (hashCode < 0) {
            hashCode = -hashCode;
        }
        int hash = (hashCode % bucketCount) / 2;

        String partition = partitonFieldOrdinal >= 0 ? items[partitonFieldOrdinal] : "none";
        for (int i = 0; i < bucketCount; ++i) {
            keyHolder.initialize();
            Integer hashPair;
            if (i < hash) {
                //larger bucket index always goes in the high part of the pair
                hashPair = hash * hashPairMult + i;
                keyHolder.add(partition, hashPair, zero);
                valueHolder.set("0" + record);
            } else {
                hashPair = i * hashPairMult + hash;
                keyHolder.add(partition, hashPair, one);
                valueHolder.set("1" + record);
            }
            context.write(keyHolder, valueHolder);
        }
    }
}
/**
* @author pranab
*
*/
public static class SimilarityReducer extends Reducer<Tuple, Text, NullWritable, Text> {
private Text valueHolder = new Text();
private String fieldDelim;
private String fieldDelimRegex;
private int delimLength;
private int hashPairMult;
private List<String[]> valueList = new ArrayList<String[]>();
private DynamicAttrSimilarityStrategy simStrategy;
private int scale;
private boolean outputCorrelation;
private int partitonFieldOrdinal;
private int intLength;
private int minIntLength;
private boolean addMatchingContext;
private int semanticScale;
private StringBuilder stBld = new StringBuilder();
private static final Logger LOG = Logger.getLogger(ItemDynamicAttributeSimilarity.SimilarityReducer.class);
/**
 * Reads the job configuration, builds the similarity strategy and caches the output
 * related settings used by this reducer.
 *
 * @param context reducer context supplying the configuration
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration config = context.getConfiguration();
    boolean debugOn = config.getBoolean("debug.on", false);
    if (debugOn) {
        LOG.setLevel(Level.DEBUG);
        System.out.println("in debug mode");
    }

    //delimiters and key decoding
    fieldDelim = config.get("field.delim", "[]");
    delimLength = fieldDelim.length();
    fieldDelimRegex = config.get("field.delim.regex", "\\[\\]");
    hashPairMult = config.getInt("idas.hash.pair.multiplier", 1000);

    //semantic matching parameters go in first, followed by any matcher specific ones
    String matcherClass = config.get("idas.semantic.matcher.class");
    Map<String, Object> strategyParams = new HashMap<String, Object>();
    strategyParams.put("matcherClass", matcherClass);
    strategyParams.put("topMatchCount", config.getInt("idas.semantic.top.match.count", 5));
    strategyParams.put("semanticScale", config.getInt("idas.semantic.match.scale", 10));
    strategyParams.put("config", config);
    loadSemanticMatcherParams(config, strategyParams);

    String simAlgorithm = config.get("idas.similarity.algorithm", "cosine");
    LOG.debug("simAlgorithm:" + simAlgorithm + " matcherClass: " + matcherClass);

    //similarity algorithm parameters are added last
    strategyParams.put("srcNonMatchingTermWeight", config.get("idas.jaccard.srcNonMatchingTermWeight"));
    strategyParams.put("trgNonMatchingTermWeight", config.get("idas.jaccard.trgNonMatchingTermWeight"));
    simStrategy = DynamicAttrSimilarityStrategy.createSimilarityStrategy(simAlgorithm, strategyParams);
    simStrategy.setFieldDelimRegex(fieldDelimRegex);

    //vector type
    boolean booleanVec = config.getBoolean("idas.vec.type.boolean", true);
    boolean semanticVec = config.getBoolean("idas.vec.type.semantic", false);
    LOG.debug("booleanVec:" + booleanVec + " semanticVec:" + semanticVec);
    if (booleanVec) {
        simStrategy.setBooleanVec(true);
    }
    if (semanticVec) {
        simStrategy.setSemanticVec(true);
    }
    if (!(booleanVec || semanticVec)) {
        //the count setting is only consulted when neither vector type is selected
        simStrategy.setCountIncluded(config.getBoolean("idas.vec.count.included", true));
    }

    //output related settings
    addMatchingContext = config.getBoolean("idas.add.semantic.matching.context", false);
    scale = config.getInt("idas.distance.scale", 1000);
    outputCorrelation = config.getBoolean("idas.output.correlation", false);
    partitonFieldOrdinal = config.getInt("idas.paritioning.field.ordinal", -1);
    minIntLength = config.getInt("idas.min.intersection.length", 2);
    LOG.debug("outputCorrelation:" + outputCorrelation + " partitonFieldOrdinal:" + partitonFieldOrdinal +
            " minIntLength:" + minIntLength);
}
/**
 * Copies matcher specific settings into the strategy parameter map. The configuration
 * entry "idas.semantic.matcher.params" holds a comma separated list of configuration
 * key names; each named key is looked up in the configuration and stored in the map
 * under the same name. Surrounding white space around a name is ignored and empty
 * names are skipped, so a list written as "a, b" behaves like "a,b".
 *
 * @param conf job configuration to read from
 * @param params parameter map to populate; the value is null for a key that is not configured
 */
private void loadSemanticMatcherParams(Configuration conf, Map<String, Object> params ) {
    String semParams = conf.get("idas.semantic.matcher.params");
    if (StringUtils.isBlank(semParams)) {
        return;
    }
    for (String token : semParams.split(",")) {
        String paramName = token.trim();
        if (!paramName.isEmpty()) {
            params.put(paramName, conf.get(paramName));
        }
    }
}
/**
 * Scores pairs of records that share a hash pair key and writes one output line per
 * pair that passes the intersection length filter.
 * <p>
 * When both halves of the hash pair are equal, all records of the group are buffered
 * and every unordered pair is scored. Otherwise records tagged "0" are buffered and
 * every other record is scored against the "0" records buffered so far, so complete
 * pairing depends on the "0" tagged records being delivered first.
 *
 * @param key tuple whose field at index 1 is the encoded hash pair
 * @param values tag prefixed records belonging to the hash pair
 * @param context reducer context used for counters and output
 */
@Override
protected void reduce(Tuple key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    intLength = -1;
    valueList.clear();
    int hashPair = key.getInt(1);
    if (hashPair / hashPairMult == hashPair % hashPairMult) {
        //same hash bucket
        context.getCounter("Reducer", "Same Bucket Count").increment(1);
        for (Text value : values) {
            //strip the leading tag character
            valueList.add(splitKey(value.toString().substring(1)));
        }
        for (int i = 0; i < valueList.size(); ++i) {
            String[] firstParts = valueList.get(i);
            for (int j = i + 1; j < valueList.size(); ++j) {
                emitSimilarity(firstParts, valueList.get(j), context);
            }
        }
    } else {
        //different hash bucket
        context.getCounter("Reducer", "Diff Bucket Count").increment(1);
        for (Text value : values) {
            String valSt = value.toString();
            String[] parts = splitKey(valSt.substring(1));
            if (valSt.startsWith("0")) {
                valueList.add(parts);
            } else {
                //match with all items of first set
                for (String[] firstParts : valueList) {
                    emitSimilarity(firstParts, parts, context);
                }
            }
        }
    }
}

/**
 * Scores one pair of records and writes it out if the intersection length reaches the
 * configured minimum or the strategy works on semantic vectors. The output line is the
 * two entity IDs followed by the integer score; in correlation mode the score is
 * inverted against the scale and the intersection length is appended. Matching contexts
 * are appended when enabled.
 *
 * @param firstParts entity ID at index 0 and attribute list at index 1 of the first record
 * @param secondParts entity ID at index 0 and attribute list at index 1 of the second record
 * @param context reducer context used for counters and output
 */
private void emitSimilarity(String[] firstParts, String[] secondParts, Context context)
        throws IOException, InterruptedException {
    double dist = (1.0 - simStrategy.findDistance(firstParts[1], secondParts[1])) * scale;
    dist = dist < 0.0 ? 0.0 : dist;
    if (LOG.isDebugEnabled()) {
        //guarded so that the message is not built for every pair when debug is off
        LOG.debug("dist:" + dist);
    }

    intLength = simStrategy.getIntersectionLength();
    if (intLength < minIntLength && !simStrategy.isSemanticVec()) {
        context.getCounter("Correlation Intersection", "Below threshold").increment(1);
        return;
    }

    //start from an empty buffer even if an earlier write failed half way
    stBld.setLength(0);
    stBld.append(firstParts[0]).append(fieldDelim).append(secondParts[0]).append(fieldDelim);
    if (outputCorrelation) {
        //2 items IDs followed by inverted score and intersection length
        dist = scale - dist;
        stBld.append((int) dist).append(fieldDelim).append(intLength);
    } else {
        //2 items IDs followed by score
        stBld.append((int) dist);
    }

    //if there is any matching context data
    if (addMatchingContext) {
        appendMatchingContexts(stBld);
    }

    valueHolder.set(stBld.toString());
    context.getCounter("Reducer", "Emit").increment(1);
    context.write(NullWritable.get(), valueHolder);
    stBld.setLength(0);
}
/**
 * Appends every matching context reported by the similarity strategy to the output
 * line, each one preceded by the field delimiter. Nothing is appended when the
 * strategy reports no contexts.
 *
 * @param stBld builder holding the output line being assembled
 */
private void appendMatchingContexts(StringBuilder stBld) {
    String[] contexts = simStrategy.getMatchingContexts();
    if (contexts == null) {
        return;
    }
    for (int i = 0; i < contexts.length; ++i) {
        stBld.append(fieldDelim);
        stBld.append(contexts[i]);
    }
}
/**
 * Splits a record (tag character already removed) into entity ID and attribute list.
 * The entity ID is the text before the first field delimiter. When a partitioning
 * field is configured it is taken to be the second field and is skipped, so the
 * attribute list starts after the second delimiter; otherwise it starts after the first.
 *
 * @param val record without the leading tag character
 * @return two element array: entity ID at index 0, remaining attribute text at index 1
 * @throws IllegalArgumentException if the record lacks the delimiter that precedes the
 * attribute list; without this check a record ending at the partitioning field would
 * silently yield part of the partition value as its attributes
 */
private String[] splitKey(String val) {
    int pos = val.indexOf(fieldDelim);
    if (pos < 0) {
        throw new IllegalArgumentException("record has no attribute list after entity ID: " + val);
    }

    //entity ID
    String entityId = val.substring(0, pos);

    //list of attributes
    int attrStart = pos + delimLength;
    if (partitonFieldOrdinal >= 0) {
        //partitioning field in the second position, attributes from the third field onwards
        int nextPos = val.indexOf(fieldDelim, attrStart);
        if (nextPos < 0) {
            throw new IllegalArgumentException(
                    "record has no attribute list after partitioning field: " + val);
        }
        attrStart = nextPos + delimLength;
    }
    return new String[] {entityId, val.substring(attrStart)};
}
}
/**
 * Partitioner that routes records by the base part of the key only, so that keys
 * differing only outside the base part go to the same reducer.
 *
 * @author pranab
 *
 */
public static class IdPairPartitioner extends Partitioner<Tuple, Text> {
    /**
     * @param key map output key
     * @param value map output value, not used
     * @param numPartitions number of reduce tasks
     * @return partition index in the range 0 to numPartitions - 1
     */
    @Override
    public int getPartition(Tuple key, Text value, int numPartitions) {
        //consider only base part of key; the sign bit is cleared because a negative
        //hash would otherwise produce a negative, hence illegal, partition index
        return (key.hashCodeBase() & Integer.MAX_VALUE) % numPartitions;
    }
}
/**
 * Grouping comparator that orders keys by their base part only, so that keys sharing
 * a base part are handed to a single reduce call.
 *
 * @author pranab
 *
 */
public static class IdPairGroupComprator extends WritableComparator {
    protected IdPairGroupComprator() {
        super(Tuple.class, true);
    }

    /**
     * @param w1 first key, expected to be a Tuple
     * @param w2 second key, expected to be a Tuple
     * @return result of comparing the base parts of the two tuples
     */
    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable w1, WritableComparable w2) {
        //consider only the base part of the key
        Tuple left = (Tuple) w1;
        return left.compareToBase((Tuple) w2);
    }
}
/**
 * Command line entry point. Runs the job through ToolRunner and exits the JVM with
 * the status returned by the job.
 *
 * @param args command line arguments passed on to the job
 * @throws Exception when the job fails to run
 */
public static void main(String[] args) throws Exception {
    Tool similarityJob = new ItemDynamicAttributeSimilarity();
    System.exit(ToolRunner.run(similarityJob, args));
}
}