package com.skp.experiment.cf.math.hadoop;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.join.CompositeInputFormat;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.hadoop.MatrixMultiplicationJob;
import com.skp.experiment.common.HadoopClusterUtil;
/**
 * Builds a matrix-multiplication MapReduce job whose reducer discards every
 * output entry whose summed value is below a configurable threshold.
 *
 * <p>The job is wired with the old {@code org.apache.hadoop.mapred} API: the two
 * input matrices are joined with an inner {@link CompositeInputFormat} join over
 * their {@code part*} sequence files, and the summed rows are pruned by
 * {@link MatrixMultiplyWithThredsholdReducer} before being written out.
 */
public class MatrixMultiplyWithThresholdJob extends
    MatrixMultiplicationJob {

  /** Configuration key holding the cardinality of the output row vectors. */
  private static final String OUT_CARD = "output.vector.cardinality";

  /** Configuration key holding the pruning threshold read by the reducer. */
  private static final String THRESHOLD_KEY = "matrix.multiply.threshold";

  /** Initial capacity hint used when allocating the pruned output vector. */
  private static final int PRUNED_INITIAL_CAPACITY = 10;

  /**
   * Creates the job configuration for a thresholded matrix multiplication.
   *
   * @param initialConf    base configuration copied into the returned {@link JobConf}
   * @param aPath          directory of the first input; its {@code part*} files are joined
   * @param bPath          directory of the second input; its {@code part*} files are joined
   * @param outPath        output directory of the job
   * @param outCardinality value stored under {@code output.vector.cardinality}
   * @param threshold      value stored under {@code matrix.multiply.threshold}; the reducer
   *                       drops entries strictly smaller than it
   * @return the fully populated job configuration (the job is not submitted here)
   * @throws IOException declared for callers; propagated from the Hadoop helpers used
   */
  public static JobConf createMatrixMultiplyWithThresholdJob(Configuration initialConf,
                                                             Path aPath,
                                                             Path bPath,
                                                             Path outPath,
                                                             int outCardinality,
                                                             float threshold) throws IOException {
    JobConf conf = new JobConf(initialConf, MatrixMultiplyWithThresholdJob.class);
    conf.setJobName("Matrix Multiply With Threshold Job");

    // Input: inner join of the two matrices' part files, read as sequence files.
    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr", CompositeInputFormat.compose(
        "inner", SequenceFileInputFormat.class, new Path(aPath, "part*"), new Path(bPath, "part*")));

    // Parameters consumed by the tasks.
    conf.setInt(OUT_CARD, outCardinality);
    conf.setFloat(THRESHOLD_KEY, threshold);

    // Runtime tuning: no speculative map attempts, wider merge fan-in.
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setInt("io.sort.factor", 100);

    // Output: sequence files under outPath.
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);

    // The combiner only sums partial rows; thresholding happens in the final reducer
    // so that partial sums are never pruned prematurely.
    // NOTE(review): mapper and combiner are presumably the nested classes inherited
    // from MatrixMultiplicationJob -- confirm against the Mahout version in use.
    conf.setMapperClass(MatrixMultiplyMapper.class);
    conf.setCombinerClass(MatrixMultiplicationReducer.class);
    conf.setReducerClass(MatrixMultiplyWithThredsholdReducer.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);
    return conf;
  }

  /**
   * Sums all partial row vectors for a row index and emits only the entries whose
   * summed value is at least the configured threshold.
   */
  public static class MatrixMultiplyWithThredsholdReducer extends MapReduceBase
      implements Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

    // Per-instance state: configure() is an instance callback, so static storage
    // would let one reducer instance overwrite another's settings whenever more
    // than one instance lives in the same JVM.
    private float threshold;
    private int outCardinality = Integer.MAX_VALUE;

    /**
     * Reads the threshold (default {@code 0}) and the output cardinality
     * (default {@link Integer#MAX_VALUE}) from the job configuration.
     *
     * @param conf job configuration supplied by the framework
     */
    @Override
    public void configure(JobConf conf) {
      threshold = conf.getFloat(THRESHOLD_KEY, 0);
      outCardinality = conf.getInt(OUT_CARD, Integer.MAX_VALUE);
    }

    /**
     * Adds up every vector received for {@code rowNum}, drops entries strictly below
     * the threshold, and collects the remaining entries as one sparse vector.
     * Nothing is emitted when the iterator is empty; a row whose entries are all
     * pruned is still emitted as an empty vector.
     *
     * @param rowNum   row index shared by all incoming partial vectors
     * @param it       partial row vectors to sum
     * @param out      collector receiving the pruned row
     * @param reporter progress reporter (unused)
     * @throws IOException if the collector fails to write the row
     */
    @Override
    public void reduce(IntWritable rowNum, Iterator<VectorWritable> it,
                       OutputCollector<IntWritable, VectorWritable> out, Reporter reporter)
        throws IOException {
      if (!it.hasNext()) {
        return;
      }

      // Copy the first vector so that accumulation never mutates a framework-owned value.
      Vector accumulator = new RandomAccessSparseVector(it.next().get());
      while (it.hasNext()) {
        Vector row = it.next().get();
        accumulator.assign(row, Functions.PLUS);
      }

      Vector prunedOutVector = new RandomAccessSparseVector(outCardinality, PRUNED_INITIAL_CAPACITY);
      Iterator<Vector.Element> iter = accumulator.iterateNonZero();
      while (iter.hasNext()) {
        Vector.Element e = iter.next();
        double value = e.get();
        // Keep values equal to or above the threshold; only strictly smaller ones are dropped.
        if (value < threshold) {
          continue;
        }
        prunedOutVector.set(e.index(), value);
      }
      out.collect(rowNum, new VectorWritable(prunedOutVector));
    }
  }
}