/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.spectral.common;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.clustering.spectral.eigencuts.EigencutsKeys;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

/**
 * Given a matrix, this job returns a vector whose i_th element is the
 * sum of all the elements in the i_th row of the original matrix.
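 *
 * <p>A minimal usage sketch; the affinity path and dimension below are
 * hypothetical, and assume the input is a SequenceFile of
 * {@code <IntWritable, VectorWritable>} rows of the affinity matrix:</p>
 *
 * <pre>{@code
 * // hypothetical input: a 100x100 affinity matrix stored under this path
 * Path affinity = new Path("/user/example/affinity");
 * // returns a dense vector of per-row sums (the diagonal of the degree matrix)
 * Vector rowSums = MatrixDiagonalizeJob.runJob(affinity, 100);
 * double firstRowSum = rowSums.get(0);
 * }</pre>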
 */
public final class MatrixDiagonalizeJob {

  private MatrixDiagonalizeJob() {
  }

  public static Vector runJob(Path affInput, int dimensions)
    throws IOException, ClassNotFoundException, InterruptedException {

    // set up all the job tasks
    Configuration conf = new Configuration();
    Path diagOutput = new Path(affInput.getParent(), "diagonal");
    HadoopUtil.delete(conf, diagOutput);
    conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, dimensions);
    Job job = new Job(conf, "MatrixDiagonalizeJob");

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(IntDoublePairWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MatrixDiagonalizeMapper.class);
    job.setReducerClass(MatrixDiagonalizeReducer.class);

    FileInputFormat.addInputPath(job, affInput);
    FileOutputFormat.setOutputPath(job, diagOutput);

    job.setJarByClass(MatrixDiagonalizeJob.class);

    job.waitForCompletion(true);

    // read the results back from the path
    return VectorCache.load(conf, new Path(diagOutput, "part-r-00000"));
  }

  public static class MatrixDiagonalizeMapper
    extends Mapper<IntWritable, VectorWritable, NullWritable, IntDoublePairWritable> {

    @Override
    protected void map(IntWritable key, VectorWritable row, Context context)
      throws IOException, InterruptedException {
      // store the sum
      IntDoublePairWritable store =
          new IntDoublePairWritable(key.get(), row.get().zSum());
      context.write(NullWritable.get(), store);
    }
  }

  public static class MatrixDiagonalizeReducer
    extends Reducer<NullWritable, IntDoublePairWritable, NullWritable, VectorWritable> {

    @Override
    protected void reduce(NullWritable key, Iterable<IntDoublePairWritable> values,
      Context context) throws IOException, InterruptedException {
      // create the return vector
      Vector retval = new DenseVector(context.getConfiguration().getInt(
          EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE));
      // put everything in its correct spot
      for (IntDoublePairWritable e : values) {
        retval.setQuick(e.getKey(), e.getValue());
      }
      // write it out
      context.write(key, new VectorWritable(retval));
    }
  }
}