/*
* Sifarish: Recommendation Engine
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.sifarish.feature;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.codehaus.jackson.map.ObjectMapper;
import org.sifarish.util.Entity;
import org.sifarish.util.Field;
import org.sifarish.util.FieldMapping;
import org.sifarish.util.Utility;
/**
 * Computes similarity between entities of two different types, based on distance measures
 * over their attributes. Attributes of the two entity types are linked through metadata,
 * which is defined in JSON for both entity types.
* @author pranab
*/
public class DiffTypeSimilarity extends Configured implements Tool {
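	/*
	 * Usage sketch (illustrative; the jar name and properties file are assumptions, not
	 * part of this source):
	 *
	 *   hadoop jar sifarish.jar org.sifarish.feature.DiffTypeSimilarity \
	 *       -conf sifarish.properties  <inputPath>  <outputPath>
	 *
	 * The schema JSON pointed to by dts.schema.file.path describes both entity types.
	 * A minimal sketch, assuming property names mirror the MixedTypeSchema bean
	 * (the actual property names are defined by that class and may differ):
	 *
	 *   { "entities" : [
	 *       { "type" : 0, "fields" : [ { "ordinal" : 0, "dataType" : "categorical",
	 *           "mappings" : [ { "matchingOrdinal" : 2 } ] } ] },
	 *       { "type" : 1, "fields" : [ { "ordinal" : 2, "dataType" : "categorical" } ] }
	 *   ] }
	 */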
@Override
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
		String jobName = "Different type entity similarity MR";
job.setJobName(jobName);
job.setJarByClass(DiffTypeSimilarity.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(DiffTypeSimilarity.SimilarityMapper.class);
job.setReducerClass(DiffTypeSimilarity.SimilarityReducer.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.setGroupingComparatorClass(IdPairGroupComprator.class);
job.setPartitionerClass(IdPairPartitioner.class);
Utility.setConfiguration(job.getConfiguration());
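		//reducer count: the job specific dts.num.reducer, if set, overrides the global num.reducer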
int numReducer = job.getConfiguration().getInt("dts.num.reducer", -1);
numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
job.setNumReduceTasks(numReducer);
int status = job.waitForCompletion(true) ? 0 : 1;
return status;
}
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new DiffTypeSimilarity(), args);
System.exit(exitCode);
}
	/**
	 * Mapper that hashes each record's ID into a bucket and replicates the record across
	 * bucket pair keys, so that every (first type, second type) record pair meets in some reducer
	 * @author pranab
	 */
public static class SimilarityMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
private LongWritable keyHolder = new LongWritable();
private Text valueHolder = new Text();
private MixedTypeSchema schema;
private int bucketCount;
private long hash;
private int idOrdinal;
private String fieldDelimRegex;
private boolean identifyWithFilePrefix;
private Entity entity;
private int filePrefixLength;
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
protected void setup(Context context) throws IOException, InterruptedException {
bucketCount = context.getConfiguration().getInt("dts.bucket.count", 1000);
fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]");
identifyWithFilePrefix = context.getConfiguration().getBoolean("dts.identify.with.file.prefix", false);
if (identifyWithFilePrefix) {
filePrefixLength = Integer.parseInt(context.getConfiguration().get("dts.file.prefix.length"));
}
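			//load the mixed type schema JSON from HDFS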
Configuration conf = context.getConfiguration();
String filePath = conf.get("dts.schema.file.path");
FileSystem dfs = FileSystem.get(conf);
Path src = new Path(filePath);
FSDataInputStream fs = dfs.open(src);
ObjectMapper mapper = new ObjectMapper();
schema = mapper.readValue(fs, MixedTypeSchema.class);
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
protected void cleanup(Context context) throws IOException, InterruptedException {
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] items = value.toString().split(fieldDelimRegex);
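			//detect which entity type this split holds, either from the configured file name
			//prefix or from the field count; detection runs once and is cached in entity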
if (null == entity) {
if (identifyWithFilePrefix) {
FileSplit fileInpSplit = (FileSplit)context.getInputSplit();
String filePrefix = fileInpSplit.getPath().getName().substring(0, filePrefixLength);
entity = schema.getEntityByFilePrefix(filePrefix);
} else {
entity = schema.getEntityBySize(items.length);
}
idOrdinal = entity.getIdField().getOrdinal();
}
if (null != entity){
hash = items[idOrdinal].hashCode() % bucketCount;
hash = hash < 0 ? -hash : hash;
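			//bucket pairing: with B = bucketCount, a first type record in bucket h is emitted
			//under base keys h*B + i for all i, and a second type record in bucket h' under
			//base keys i*B + h' for all i, so the pair meets exactly once at base key h*B + h'.
			//The base key is scaled by 10 and a type digit (0 or 1) appended, which makes
			//first type records sort ahead of second type records within a reduce group.
			//For example, with B = 2, a type 0 record with h = 1 gets keys 20 and 30, and a
			//type 1 record with h' = 0 gets keys 1 and 21; they meet at base key 2 (keys 20, 21).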
if (entity.getType() == 0){
if (identifyWithFilePrefix) {
valueHolder.set ( "0," + value.toString());
} else {
valueHolder.set(value);
}
for (int i = 0; i < bucketCount; ++i) {
keyHolder.set((hash * bucketCount + i) * 10);
context.write(keyHolder, valueHolder);
}
} else {
if (identifyWithFilePrefix) {
valueHolder.set ( "1," + value.toString());
} else {
valueHolder.set(value);
}
for (int i = 0; i < bucketCount; ++i) {
keyHolder.set(((i * bucketCount + hash ) * 10) + 1);
context.write(keyHolder, valueHolder);
}
}
			}
}
}
	/**
	 * Reducer that pairs each second type record with every buffered first type record in
	 * the group and emits their attribute based similarity
	 * @author pranab
	 */
public static class SimilarityReducer extends Reducer<LongWritable, Text, NullWritable, Text> {
private Text valueHolder = new Text();
private MixedTypeSchema schema;
private int firstTypeSize;
private List<String> firstTypeValues = new ArrayList<String>();
private int firstIdOrdinal;
private int secondIdOrdinal;
private String firstId;
private String secondId;
private int firstClassAttrOrdinal = -1;
private int secondClassAttrOrdinal = -1;
private String firstClassAttr;
private String secondClassAttr;
private int sim;
private List<Field> fields;
private List<Field> targetFields;
private int scale;
private Map<Integer, MappedValue> mappedFields = new HashMap<Integer, MappedValue>();
private static final int INVALID_ORDINAL = -1;
private int srcCount;
private int targetCount;
private int simCount;
private int simResultCnt;
private boolean prntDetail;
private DistanceStrategy distStrategy;
private String fieldDelimRegex;
private String fieldDelim;
private DynamicAttrSimilarityStrategy textSimStrategy;
private boolean outputVerbose;
private boolean identifyWithFilePrefix;
private boolean firstType;
private String valueSt;
private String[] items;
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
*/
protected void setup(Context context) throws IOException, InterruptedException {
//load schema
Configuration conf = context.getConfiguration();
String filePath = conf.get("dts.schema.file.path");
FileSystem dfs = FileSystem.get(conf);
Path src = new Path(filePath);
FSDataInputStream fs = dfs.open(src);
ObjectMapper mapper = new ObjectMapper();
schema = mapper.readValue(fs, MixedTypeSchema.class);
firstTypeSize = schema.getEntityByType(0).getFieldCount();
firstIdOrdinal = schema.getEntityByType(0).getIdField().getOrdinal();
secondIdOrdinal = schema.getEntityByType(1).getIdField().getOrdinal();
Field field = schema.getEntityByType(0).getClassAttributeField();
if (null != field) {
firstClassAttrOrdinal = field.getOrdinal();
				Field secondClassField = schema.getEntityByType(1).getClassAttributeField();
				if (null != secondClassField) {
					secondClassAttrOrdinal = secondClassField.getOrdinal();
				}
}
fields = schema.getEntityByType(0).getFields();
targetFields = schema.getEntityByType(1).getFields();
scale = context.getConfiguration().getInt("dts.distance.scale", 1000);
distStrategy = schema.createDistanceStrategy(scale);
fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]");
fieldDelim = context.getConfiguration().get("field.delim", ",");
textSimStrategy = schema.createTextSimilarityStrategy();
outputVerbose = context.getConfiguration().getBoolean("dts.sim.output.verbose", true);
identifyWithFilePrefix = context.getConfiguration().getBoolean("dts.identify.with.file.prefix", false);
System.out.println("firstTypeSize: " + firstTypeSize + " firstIdOrdinal:" +firstIdOrdinal +
" secondIdOrdinal:" + secondIdOrdinal + " Source field count:" + fields.size() +
" Target field count:" + targetFields.size());
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
protected void reduce(LongWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
firstTypeValues.clear();
srcCount = 0;
targetCount = 0;
simCount = 0;
StringBuilder stBld = new StringBuilder();
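			//the grouping comparator groups on the base key while the type digit keeps first
			//type records ahead of second type ones, so all first type records are buffered
			//first and each second type record is then paired against the buffer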
for (Text value : values){
				items = value.toString().split(fieldDelimRegex);
if ( identifyWithFilePrefix) {
firstType = value.toString().startsWith("0");
					valueSt = value.toString().substring(2);
} else {
firstType = items.length == firstTypeSize;
valueSt = value.toString();
}
if (firstType){
firstTypeValues.add(valueSt);
++srcCount;
} else {
String second = valueSt;
items = second.split(fieldDelimRegex);
secondId = items[secondIdOrdinal];
if (secondClassAttrOrdinal >= 0) {
secondClassAttr = items[secondClassAttrOrdinal];
}
for (String first : firstTypeValues){
					//prntDetail = ++simResultCnt % 10000 == 0;
					stBld.delete(0, stBld.length());
					sim = findSimilarity(first, second, context);
items = first.split(fieldDelimRegex);
firstId = items[firstIdOrdinal];
if (firstClassAttrOrdinal >= 0) {
firstClassAttr = items[firstClassAttrOrdinal];
}
					if (outputVerbose) {
						if (firstClassAttrOrdinal >= 0) {
							stBld.append(firstId).append(fieldDelim).append(firstClassAttr).append(fieldDelim).append(secondClassAttr).
								append(fieldDelim).append(second).append(fieldDelim).append(sim);
						} else {
							stBld.append(firstId).append(fieldDelim).append(second).append(fieldDelim).append(sim);
						}
						valueHolder.set(stBld.toString());
					} else {
						if (firstClassAttrOrdinal >= 0) {
							stBld.append(firstId).append(fieldDelim).append(secondId).append(fieldDelim).append(firstClassAttr).
								append(fieldDelim).append(secondClassAttr).append(fieldDelim).append(sim);
						} else {
							stBld.append(firstId).append(fieldDelim).append(secondId).append(fieldDelim).append(sim);
						}
						valueHolder.set(stBld.toString());
					}
context.write(NullWritable.get(), valueHolder);
++simCount;
}
++targetCount;
}
}
context.getCounter("Data", "Source Count").increment(srcCount);
context.getCounter("Data", "Target Count").increment(targetCount);
context.getCounter("Data", "Similarity Count").increment(simCount);
}
		/**
		 * Computes similarity between two entities by accumulating per attribute distances
		 * @param source first type record
		 * @param target second type record
		 * @param context
		 * @return similarity score, scaled per dts.distance.scale
		 * @throws IOException
		 */
private int findSimilarity(String source, String target, Context context) throws IOException {
int sim = 0;
mapFields(source, context);
String[] trgItems = target.split(fieldDelimRegex);
double dist = 0;
context.getCounter("Data", "Target Field Count").increment(targetFields.size());
if (prntDetail){
System.out.println("target record: " + trgItems[0]);
}
distStrategy.initialize();
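			//walk the target entity's fields, compute a per attribute distance against the
			//mapped source values, and let the distance strategy aggregate into a similarity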
for (Field field : targetFields) {
dist = 0;
Integer ordinal = field.getOrdinal();
String trgItem = trgItems[ordinal];
boolean skipAttr = false;
if (prntDetail){
System.out.println("ordinal: " + ordinal + " target:" + trgItem);
}
MappedValue mappedValueObj = mappedFields.get(ordinal);
if (null == mappedValueObj){
//non mapped passive attributes
continue;
}
List<String> mappedValues = mappedValueObj.getValues();
Field srcField = mappedValueObj.getField();
if (!trgItem.isEmpty()) {
if (field.getDataType().equals("categorical")) {
if (!mappedValues.isEmpty()) {
double thisDist;
dist = 1.0;
for (String mappedValue : mappedValues) {
thisDist = schema.findCattegoricalDistance(mappedValue, trgItem, ordinal);
if (thisDist < dist) {
dist = thisDist;
}
if (prntDetail){
System.out.println("dist calculation: ordinal: " + ordinal + " src:" + mappedValue + " target:" + trgItem +
" dist:" + dist);
}
}
context.getCounter("Data", "Dist Calculated").increment(1);
} else {
//missing source
if (schema.getMissingValueHandler().equals("default")){
dist = getDistForMissingSrc(field, trgItem);
} else {
skipAttr = true;
}
context.getCounter("Data", "Missing Source").increment(1);
}
} else if (field.getDataType().equals("int")) {
if (!mappedValues.isEmpty()) {
int trgItemInt = Integer.parseInt(trgItem);
int srcItemInt = getAverageMappedValue(mappedValues);
dist = getDistForNumeric(srcField, srcItemInt, field, trgItemInt);
} else {
//missing source
if (schema.getMissingValueHandler().equals("default")){
dist = getDistForMissingSrc(field, trgItem);
} else {
skipAttr = true;
}
}
} else if (field.getDataType().equals("text")) {
if (!mappedValues.isEmpty()) {
String trgItemTxt = trgItem;
String srcItemTxt = mappedValues.get(0);
dist = textSimStrategy.findDistance(trgItemTxt, srcItemTxt);
} else {
//missing source
if (schema.getMissingValueHandler().equals("default")){
dist = getDistForMissingSrc(field, trgItem);
} else {
skipAttr = true;
}
}
} else if (field.getDataType().equals("location")) {
if (!mappedValues.isEmpty()) {
String trgItemTxt = trgItem;
String srcItemTxt = mappedValues.get(0);
dist = getDistForLocation(trgItemTxt, srcItemTxt, field);
} else {
//missing source
skipAttr = true;
}
}
} else {
//missing target value
if (schema.getMissingValueHandler().equals("default")){
context.getCounter("Data", "Missing Target").increment(1);
dist = getDistForMissingTrg(field, mappedValues);
} else {
skipAttr = true;
}
}
if (!skipAttr) {
distStrategy.accumulate(dist, field);
}
}
sim = distStrategy.getSimilarity();
return sim;
}
		/**
		 * Gets distance between numeric values, according to the source field's distance
		 * function (equalSoft, equalHard, minSoft, minHard, maxSoft or maxHard)
		 * @param srcField
		 * @param srcVal
		 * @param trgField
		 * @param trgVal
		 * @return normalized distance
		 */
private double getDistForNumeric(Field srcField, int srcVal, Field trgField, int trgVal){
double dist = 0;
boolean linear = false;
String distFun = srcField.getNumDistFunction();
if (distFun.equals("equalSoft")) {
linear = true;
} else if (distFun.equals("equalHard")) {
dist = srcVal == trgVal ? 0 : 1;
} else if (distFun.equals("minSoft")) {
if (trgVal >= srcVal) {
dist = 0;
} else {
linear = true;
}
} else if (distFun.equals("minHard")) {
dist = trgVal >= srcVal ? 0 : 1;
} else if (distFun.equals("maxSoft")) {
if (trgVal <= srcVal) {
dist = 0;
} else {
linear = true;
}
} else if (distFun.equals("maxHard")) {
dist = trgVal <= srcVal ? 0 : 1;
}
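			//linear normalization: e.g. with srcVal = 5, trgVal = 8 and target range [0, 10],
			//dist = |5 - 8| / (10 - 0) = 0.3; if the target field has no usable range, the
			//relative difference is compared against the schema's numeric diff threshold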
if (linear) {
if (trgField.getMax() > trgField.getMin()) {
dist = ((double)(srcVal - trgVal)) / (trgField.getMax() - trgField.getMin());
} else {
int max = srcVal > trgVal ? srcVal : trgVal;
double diff = ((double)(srcVal - trgVal)) / max;
if (diff < 0) {
diff = - diff;
}
dist = diff > schema.getNumericDiffThreshold() ? 1.0 : 0.0;
}
if (dist < 0) {
dist = -dist;
}
}
return dist;
}
/**
* Gets distance between geo location values
* @param trgItemTxt
* @param srcItemTxt
* @param field
* @return
*/
private double getDistForLocation(String trgItemTxt, String srcItemTxt, Field field ) {
double dist = org.sifarish.util.Utility.getGeoDistance(trgItemTxt, srcItemTxt);
dist /= field.getMaxDistance();
dist = dist <= 1.0 ? dist : 1.0;
return dist;
}
		/**
		 * Gets distance when the source attribute value is missing
		 * @param trgField
		 * @param trgVal
		 * @return
		 */
private double getDistForMissingSrc(Field trgField, String trgVal){
double dist = 0;
if (trgField.getDataType().equals("categorical") || trgField.getDataType().equals("text")) {
dist = 0;
} else if (trgField.getDataType().equals("int")) {
int trgValInt = Integer.parseInt(trgVal);
int max = trgField.getMax();
int min = trgField.getMin();
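			//pessimistic estimate: assume the missing source value sits at whichever end of
			//the target field's range is farthest from the target value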
			if (max > min) {
double upper = ((double)(max - trgValInt)) / (max - min);
double lower = ((double)(trgValInt - min)) / (max - min);
dist = upper > lower ? upper : lower;
} else {
dist = 1;
}
}
return dist;
}
		/**
		 * Gets distance when the target attribute value is missing
		 * @param trgField
		 * @param mappedValues
		 * @return
		 */
private double getDistForMissingTrg(Field trgField, List<String> mappedValues){
double dist = 0;
if (trgField.getDataType().equals("categorical") || trgField.getDataType().equals("text")) {
dist = 1;
} else if (trgField.getDataType().equals("int")) {
int srcValInt = getAverageMappedValue(mappedValues);
int max = trgField.getMax();
int min = trgField.getMin();
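			//pessimistic estimate: assume the missing target value sits at whichever end
			//of the range is farthest from the average of the mapped source values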
if (max > min) {
double upper = ((double)(max - srcValInt)) / (max - min);
double lower = ((double)(srcValInt - min)) / (max - min);
dist = upper > lower ? upper : lower;
} else {
dist = 1;
}
}
return dist;
}
		/**
		 * Builds the map from target field ordinal to source field values, applying any
		 * value mappings defined in the schema
		 * @param source first type record
		 * @param context
		 */
private void mapFields(String source, Context context){
mappedFields.clear();
String[] srcItems = source.split(fieldDelimRegex);
if (prntDetail){
System.out.println("src record: " + srcItems[0]);
}
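			//for each source field that has mappings: find the target ordinal it maps to,
			//translate the source value through any value mappings (categorical equality or
			//int ranges), otherwise pass the raw value through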
for (Field field : fields) {
List<FieldMapping> mappings = field.getMappings();
if (null != mappings){
for (FieldMapping fldMapping : mappings) {
int matchingOrdinal = fldMapping.getMatchingOrdinal();
if (-1 == matchingOrdinal) {
continue;
}
MappedValue mappedValue = mappedFields.get(matchingOrdinal);
if (null == mappedValue){
mappedValue = new MappedValue();
mappedValue.setField(field);
mappedFields.put(matchingOrdinal, mappedValue);
}
List<String> mappedValues = mappedValue.getValues();
String value = srcItems[field.getOrdinal()];
if (prntDetail){
System.out.println("src value: " + value);
}
List<FieldMapping.ValueMapping> valueMappings = fldMapping.getValueMappings();
if (null != valueMappings) {
for (FieldMapping.ValueMapping valMapping : valueMappings) {
if (field.getDataType().equals("categorical")) {
//store mapped values
if (valMapping.getThisValue().equals(value)) {
mappedValues.add(valMapping.getThatValue());
context.getCounter("Data", "Mapped Value").increment(1);
if (prntDetail){
System.out.println("mapped: " + value + " " + valMapping.getThatValue() +
" matching ordinal:" + matchingOrdinal);
}
break;
}
} else if (field.getDataType().equals("int")) {
int valueInt = Integer.parseInt(value);
int[] range = valMapping.getThisValueRange();
if (null != range) {
if (valueInt >= range[0] && valueInt <= range[1]) {
mappedValues.add(valMapping.getThatValue());
break;
}
}
}
}
} else {
if (prntDetail){
System.out.println("non mapped: " + value + " matching ordinal:" + matchingOrdinal);
}
if (!value.isEmpty()) {
mappedValues.add(value);
}
}
}
}
}
}
		/**
		 * Returns the integer average of the mapped values
		 * @param mappedValues
		 * @return average, or 0 if there are no mapped values
		 */
		private int getAverageMappedValue(List<String> mappedValues){
			int sum = 0;
			int count = 0;
			for (String mappedValue : mappedValues) {
				sum += Integer.parseInt(mappedValue);
				++count;
			}
			//guard against an empty list to avoid division by zero
			return count > 0 ? sum / count : 0;
		}
}
	/**
	 * Partitions on the base part of the key, ignoring the type suffix, so that both
	 * entity types of a bucket pair land on the same reducer
	 * @author pranab
	 */
public static class IdPairPartitioner extends Partitioner<LongWritable, Text> {
@Override
public int getPartition(LongWritable key, Text value, int numPartitions) {
//consider only base part of key
int keyVal = (int)(key.get() / 10);
return keyVal % numPartitions;
}
}
	/**
	 * Groups keys by the base part, ignoring the type suffix; within a group the raw key
	 * order still places first type records (suffix 0) before second type ones (suffix 1)
	 * @author pranab
	 */
public static class IdPairGroupComprator extends WritableComparator {
private static final int KEY_EXTENSION_SCALE = 10;
protected IdPairGroupComprator() {
super(LongWritable.class, true);
}
@Override
public int compare(WritableComparable w1, WritableComparable w2) {
//consider only the base part of the key
Long t1 = ((LongWritable)w1).get() / KEY_EXTENSION_SCALE;
Long t2 = ((LongWritable)w2).get() / KEY_EXTENSION_SCALE;
int comp = t1.compareTo(t2);
return comp;
}
}
	/**
	 * Source field values mapped to a target field ordinal
	 * @author pranab
	 */
public static class MappedValue {
private List<String> values = new ArrayList<String>();
private Field field;
public List<String> getValues() {
return values;
}
public void setValues(List<String> values) {
this.values = values;
}
public Field getField() {
return field;
}
public void setField(Field field) {
this.field = field;
}
}
}