package com.skp.experiment.cf.math.hadoop.similarity;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
/*
 * Assumes the input vector is stored in a single sequence file;
 * only the first value read from that file is used.
 */
/**
 * Mapper that measures the distance between every incoming matrix row and one fixed
 * input vector.
 *
 * <p>For each (row index, row vector) record, a sparse vector holding a single entry is
 * written under a {@link NullWritable} key: the entry's index is the row index and its
 * value is the distance between that row and the input vector.
 *
 * <p>The distance measure class name is read from the configuration key
 * {@code MatrixRowVectorSimilarityJob.DISTANCE_MEASIRE_KEY}; the input vector is the first
 * value of the sequence file whose location is stored under
 * {@code MatrixRowVectorSimilarityJob.INPUT_VECTOR_PATH}.
 */
public class MatrixRowVectorSimilarityMapper extends
    Mapper<IntWritable, VectorWritable, NullWritable, VectorWritable> {

  /** Distance measure instantiated and configured in {@link #setup}. */
  private DistanceMeasure measure;

  /** Vector every row is compared against; loaded once in {@link #setup}. */
  private Vector inputVector;

  /**
   * Emits the distance between one matrix row and the input vector.
   *
   * @param key index of the matrix row
   * @param value the matrix row
   * @param context task context the result is written to
   * @throws IOException if writing the output record fails
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  protected void map(IntWritable key, VectorWritable value, Context context)
      throws IOException, InterruptedException {
    double distance = measure.distance(value.get(), inputVector);
    // One-entry sparse vector: position = row index, value = distance to the input vector.
    Vector outVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
    outVector.set(key.get(), distance);
    context.write(NullWritable.get(), new VectorWritable(outVector));
  }

  /**
   * Instantiates and configures the distance measure, then loads the input vector.
   *
   * @param context task context providing the job configuration
   * @throws IOException if the input vector cannot be read
   * @throws InterruptedException if the superclass setup is interrupted
   */
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    measure =
        ClassUtils.instantiateAs(
            conf.get(MatrixRowVectorSimilarityJob.DISTANCE_MEASIRE_KEY),
            DistanceMeasure.class);
    measure.configure(conf);
    inputVector = retrieveInputVector(context);
  }

  /**
   * Reads the first vector from the sequence file named in the job configuration.
   *
   * @param context task context providing the job configuration
   * @return the first vector stored in the configured file
   * @throws IOException if the file cannot be read or contains no vector
   * @throws IllegalArgumentException if the input vector path is not configured
   */
  private Vector retrieveInputVector(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    String location = conf.get(MatrixRowVectorSimilarityJob.INPUT_VECTOR_PATH);
    if (location == null) {
      throw new IllegalArgumentException(
          "Input vector path is not set in the job configuration");
    }
    Path inputVectorPath = new Path(location);
    SequenceFileValueIterator<VectorWritable> iterator =
        new SequenceFileValueIterator<VectorWritable>(inputVectorPath, true, conf);
    try {
      if (!iterator.hasNext()) {
        throw new IOException("No input vector found in " + inputVectorPath);
      }
      return iterator.next().get();
    } finally {
      // Only the first value is consumed, so the iterator never reaches the end of the
      // file on its own; close it explicitly to release the underlying reader.
      iterator.close();
    }
  }
}