/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.clustering.spectral.eigencuts;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.clustering.spectral.common.VectorCache;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

/**
* <p>There are quite a few operations bundled within this mapper. Gather 'round
* and listen, all of ye.</p>
*
* <p>The input to this job consists of eight items:</p>
* <ol><li>B<sub>0</sub>, a command-line parameter fed through the Configuration object</li>
* <li>diagonal matrix, a constant vector fed through the Hadoop cache</li>
* <li>list of eigenvalues, a constant vector fed through the Hadoop cache</li>
* <li>eigenvector, the input value to the mapper</li>
* <li>epsilon, a command-line parameter fed through the Configuration object</li>
* <li>delta, a command-line parameter fed through the Configuration object</li>
* <li>tau, a command-line parameter fed through the Configuration object</li>
* <li>output, the Path to the output matrix of sensitivities</li></ol>
*
* <p>The first three items are constant and are used in all of the map
* tasks. The row index indicates which eigenvalue from the list to use, and
* also serves as the output identifier. The diagonal matrix and the
* eigenvector are of equal length and are iterated through twice
* within each map task, giving each task a runtime of
* O(n<sup>2</sup>). This is unavoidable.</p>
*
* <p>For each (i, j) combination of elements within the eigenvector, a complex
* equation is run that explicitly computes the sensitivity to perturbation of
* the flow of probability along that specific edge of the graph. Each
* sensitivity, as it is computed, is simultaneously subjected to a non-maximal
* suppression step: a given sensitivity S<sub>ij</sub> is suppressed if
* any other S<sub>in</sub> or S<sub>mj</sub> has a more negative value. Thus, only the most
* negative S<sub>ij</sub> within its row i or its column j is stored in the return
* array, leading to an output (per eigenvector!) with maximum length n,
* minimum length 1.</p>
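*
* <p>A minimal sketch of that suppression test (the dense array {@code s}
* holding the full n-by-n sensitivity matrix is hypothetical; the mapper
* itself works incrementally rather than materializing it):</p>
* <pre>{@code
* // s[i][j] survives only if no entry in row i or column j is more negative
* boolean isSuppressed(double[][] s, int i, int j) {
*   for (int k = 0; k < s.length; k++) {
*     if (s[i][k] < s[i][j] || s[k][j] < s[i][j]) {
*       return true; // a more negative sensitivity exists; suppress s[i][j]
*     }
*   }
*   return false;
* }
* }</pre>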
*
* <p>Overall, this creates an n-by-n (possibly sparse) matrix with a maximum
* of n<sup>2</sup> non-zero elements and a minimum of n non-zero elements.</p>
*/
public final class EigencutsSensitivityJob {
private EigencutsSensitivityJob() {
}

/**
* Initializes the configuration tasks, loads the needed data into
* the HDFS cache, and executes the job.
*
* @param eigenvalues Vector of eigenvalues
* @param diagonal Vector representing the diagonal matrix
* @param eigenvectors Path to the DRM of eigenvectors
* @param beta Value of B<sub>0</sub>, passed to the mappers through the Configuration
* @param tau Value of tau, passed to the mappers through the Configuration
* @param delta Value of delta, passed to the mappers through the Configuration
* @param epsilon Value of epsilon, passed to the mappers through the Configuration
* @param output Path to the output matrix (will have between n and
* n<sup>2</sup> non-zero elements)
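*
* <p>A usage sketch (the vector contents, parameter values, and paths below
* are hypothetical):</p>
* <pre>{@code
* Vector eigenvalues = new DenseVector(new double[] {0.98, 0.95, 0.90});
* Vector diagonal = new DenseVector(new double[] {4.0, 3.0, 5.0});
* EigencutsSensitivityJob.runJob(eigenvalues, diagonal,
*     new Path("/tmp/eigencuts/eigenvectors"),
*     2.0,  // beta
*     1.0,  // tau
*     0.1,  // delta
*     0.25, // epsilon
*     new Path("/tmp/eigencuts/sensitivities"));
* }</pre>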
*/
public static void runJob(Vector eigenvalues,
Vector diagonal,
Path eigenvectors,
double beta,
double tau,
double delta,
double epsilon,
Path output)
throws IOException, ClassNotFoundException, InterruptedException {
// save the two vectors to the distributed cache
Configuration jobConfig = new Configuration();
Path eigenOutputPath = new Path(output.getParent(), "eigenvalues");
Path diagOutputPath = new Path(output.getParent(), "diagonal");
jobConfig.set(EigencutsKeys.VECTOR_CACHE_BASE, output.getParent().getName());
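// VectorCache.save writes each vector to a SequenceFile at the given path
// and registers that file in the DistributedCache for the map tasks to read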
VectorCache.save(new IntWritable(EigencutsKeys.EIGENVALUES_CACHE_INDEX),
eigenvalues, eigenOutputPath, jobConfig);
VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX),
diagonal, diagOutputPath, jobConfig);
// set up the rest of the job
jobConfig.set(EigencutsKeys.BETA, Double.toString(beta));
jobConfig.set(EigencutsKeys.EPSILON, Double.toString(epsilon));
jobConfig.set(EigencutsKeys.DELTA, Double.toString(delta));
jobConfig.set(EigencutsKeys.TAU, Double.toString(tau));
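// the mapper emits (row index, sensitivity node) pairs; the reducer
// assembles each row's surviving sensitivities into a single output vector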
Job job = new Job(jobConfig, "EigencutsSensitivityJob");
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(EigencutsSensitivityNode.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(VectorWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setMapperClass(EigencutsSensitivityMapper.class);
job.setReducerClass(EigencutsSensitivityReducer.class);
FileInputFormat.addInputPath(job, eigenvectors);
FileOutputFormat.setOutputPath(job, output);
// surface job failures instead of silently ignoring the return value
boolean succeeded = job.waitForCompletion(true);
if (!succeeded) {
throw new IllegalStateException("Job failed!");
}
}
}