package com.skp.experiment.cf.math.hadoop.similarity; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.common.ClassUtils; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.common.distance.CosineDistanceMeasure; import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator; import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator; import org.apache.mahout.common.mapreduce.MergeVectorsCombiner; import org.apache.mahout.common.mapreduce.MergeVectorsReducer; import org.apache.mahout.math.Vector; import org.apache.mahout.math.VectorWritable; public class MatrixRowVectorSimilarityJob extends AbstractJob { //public static final String INPUT_VECTOR_OUTPUT_PATH = "DistributedMatrix.rowsimilarity"; public static final String INPUT_VECTOR_PATH = ""; public static final String DISTANCE_MEASIRE_KEY = "distance.measure"; /* * input: matrix input, input vector, distance measure */ @Override public int run(String[] args) throws Exception { /* setup input options */ addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); addOption(INPUT_VECTOR_PATH, "iv", "input vector path."); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); Path inputVector = new Path(getOption(INPUT_VECTOR_PATH)); /* setup distance measure class */ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); if (measureClass == null) { measureClass = CosineDistanceMeasure.class.getName(); } DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); if (getConf() == null) { setConf(new Configuration()); } /* set required parameters into configuration */ Configuration conf = getConf(); conf.set(DISTANCE_MEASIRE_KEY, measure.getClass().getName()); conf.set(INPUT_VECTOR_PATH, inputVector.getName()); /* build job */ Job job = prepareJob(input, output, SequenceFileInputFormat.class, MatrixRowVectorSimilarityMapper.class, NullWritable.class, VectorWritable.class, MergeVectorsReducer.class, NullWritable.class, VectorWritable.class, SequenceFileOutputFormat.class); job.setJarByClass(MatrixRowVectorSimilarityJob.class); job.setCombinerClass(MergeVectorsCombiner.class); job.waitForCompletion(true); return 0; } /* * matrixPath: <IntWritable, VectorWritable> * inputVectorPath: <NullWritable, VectorWritable> */ public static Job createMatrixRowVectorSimilarityJob(Configuration conf, Path matrixPath, Path inputVectorPath, Path outputPath, String measureClass) throws IOException { conf.set(DISTANCE_MEASIRE_KEY, measureClass); conf.set(INPUT_VECTOR_PATH, inputVectorPath.getName()); Job job = new Job(conf, "Matrix row vector similarity job"); job.setJarByClass(MatrixRowVectorSimilarityJob.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileInputFormat.addInputPath(job, matrixPath); FileOutputFormat.setOutputPath(job, outputPath); job.setMapperClass(MatrixRowVectorSimilarityMapper.class); job.setReducerClass(MergeVectorsReducer.class); job.setCombinerClass(MergeVectorsCombiner.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(VectorWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(VectorWritable.class); return job; } public static Vector retrieveMatrixRowVectorSimilarity(Configuration conf, Path output) throws IOException { SequenceFileValueIterator<VectorWritable> iterator = new SequenceFileValueIterator<VectorWritable>(output, true, conf); return iterator.next().get(); } }