package com.skp.experiment.cf.math.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

/**
 * Map-only job that normalizes each row of a matrix stored as a SequenceFile
 * of {@code <IntWritable, VectorWritable>} pairs: every row vector is divided
 * by the sum of its entries, so for non-negative input each row sums to one.
 * Rows are independent of each other, so no combiner or reducer is needed.
 */
public class MatrixRowNormalizeJob extends AbstractJob {

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    if (parseArguments(args) == null) {
      return -1;
    }
    Job job = prepareJob(getInputPath(), getOutputPath(),
        SequenceFileInputFormat.class, MatrixRowNormalizeMapper.class,
        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    job.setJarByClass(MatrixRowNormalizeJob.class);
    job.setNumReduceTasks(0);
    // Propagate the job's success or failure instead of always returning 0.
    return job.waitForCompletion(true) ? 0 : -1;
  }

  /**
   * Builds (but does not submit) a row-normalization job, for callers that
   * want to embed this step in a larger pipeline.
   */
  public static Job createMatrixRowNormalizeJob(Path input, Path output, Configuration conf)
      throws IOException {
    Job job = new Job(conf, "Matrix Row Normalize Job");
    job.setJarByClass(MatrixRowNormalizeJob.class);
    FileSystem fs = FileSystem.get(conf);
    input = fs.makeQualified(input);
    output = fs.makeQualified(output);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MatrixRowNormalizeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setNumReduceTasks(0);
    return job;
  }

  /** Divides each row vector by the sum of its entries ({@code zSum}). */
  public static class MatrixRowNormalizeMapper
      extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
    @Override
    protected void map(IntWritable key, VectorWritable value, Context context)
        throws IOException, InterruptedException {
      Vector v = value.get();
      double sum = v.zSum();
      // Guard against all-zero rows: dividing by a zero sum would emit NaNs,
      // so such rows are passed through unchanged.
      Vector norm = sum == 0.0 ? v : v.divide(sum);
      context.write(key, new VectorWritable(norm));
    }
  }
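
  /*
   * Minimal entry-point sketch: AbstractJob implements Hadoop's Tool
   * interface, so the job can be launched from the command line through
   * ToolRunner, e.g.
   *
   *   hadoop jar experiment.jar com.skp.experiment.cf.math.hadoop.MatrixRowNormalizeJob \
   *       --input <matrix seqfile> --output <output dir>
   *
   * The jar name "experiment.jar" and the paths are illustrative. ToolRunner
   * is referenced by its fully qualified name to keep this sketch
   * self-contained.
   */
  public static void main(String[] args) throws Exception {
    org.apache.hadoop.util.ToolRunner.run(new MatrixRowNormalizeJob(), args);
  }
}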