package com.skp.experiment.cf.math.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.common.mapreduce.MergeVectorsCombiner;
import org.apache.mahout.common.mapreduce.MergeVectorsReducer;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

import com.google.common.io.Closeables;
import com.skp.experiment.common.mapreduce.AverageVectorMapper;

/**
 * Reads a DistributedRowMatrix ({@code <WritableComparable, VectorWritable>}) as input,
 * aggregates its rows into a single vector, and writes the result as
 * {@code <NullWritable, VectorWritable>}.
 */
public class MatrixRowAggregateJob extends AbstractJob {

  private MatrixRowAggregateJob() {}

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new MatrixRowAggregateJob(), args);
  }

  /** Builds (but does not submit) the row-aggregation job. */
  public static Job createAggregateRowJob(Path matrixInputPath,
                                          Path outputVectorPathBase) throws IOException {
    Job job = new Job(new Configuration(), "DistributedRowMatrix aggregate row job");
    job.setJarByClass(MatrixRowAggregateJob.class);

    Configuration conf = job.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

    FileInputFormat.addInputPath(job, matrixInputPath);
    FileOutputFormat.setOutputPath(job, outputVectorPathBase);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(AverageVectorMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    // Without these, the job falls back to the LongWritable/Text defaults and
    // fails at write time, since the reducer emits <NullWritable, VectorWritable>.
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    return job;
  }

  /** Reads back the single aggregated vector written by a completed job. */
  public static Vector retrieveAggregatedRowOutputVector(Job job) throws IOException {
    Path outputPath = FileOutputFormat.getOutputPath(job);
    // The new (org.apache.hadoop.mapreduce) API names reducer output files part-r-NNNNN,
    // not part-NNNNN as in the old mapred API.
    Path outputFile = new Path(outputPath, "part-r-00000");
    SequenceFileValueIterator<VectorWritable> iterator =
        new SequenceFileValueIterator<VectorWritable>(outputFile, true, job.getConfiguration());
    try {
      return iterator.next().get();
    } finally {
      Closeables.closeQuietly(iterator);
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    if (parseArguments(args) == null) {
      return -1;
    }

    Job job = prepareJob(getInputPath(), getOutputPath(),
                         SequenceFileInputFormat.class,
                         AverageVectorMapper.class, NullWritable.class, VectorWritable.class,
                         MergeVectorsReducer.class, NullWritable.class, VectorWritable.class,
                         SequenceFileOutputFormat.class);
    job.setJarByClass(MatrixRowAggregateJob.class);
    job.setCombinerClass(MergeVectorsCombiner.class);

    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : -1;
  }
}
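
/*
 * Usage sketch: how the static helpers above might be driven programmatically
 * rather than through ToolRunner. This is an illustrative, package-private
 * example, not part of the job itself; the paths ("matrix", "avg-out") are
 * hypothetical placeholders. Only createAggregateRowJob and
 * retrieveAggregatedRowOutputVector come from MatrixRowAggregateJob.
 */
class MatrixRowAggregateJobExample {

  public static void main(String[] args) throws Exception {
    Path input = new Path("matrix");    // hypothetical DistributedRowMatrix input
    Path output = new Path("avg-out");  // hypothetical output directory

    // Build and submit the aggregation job, blocking until it finishes.
    Job job = MatrixRowAggregateJob.createAggregateRowJob(input, output);
    if (!job.waitForCompletion(true)) {
      throw new IllegalStateException("row aggregation job failed");
    }

    // The job leaves a single <NullWritable, VectorWritable> pair on HDFS;
    // read the vector back into the driver process.
    Vector aggregated = MatrixRowAggregateJob.retrieveAggregatedRowOutputVector(job);
    System.out.println("aggregated vector has "
        + aggregated.getNumNondefaultElements() + " non-default elements");
  }
}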