/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.spectral.eigencuts;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.clustering.spectral.common.VertexWritable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class EigencutsAffinityCutsJob {
private static final Logger log = LoggerFactory.getLogger(EigencutsAffinityCutsJob.class);
private EigencutsAffinityCutsJob() {
}
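
  // Hadoop counter tracking how many affinity entries are cut in an
  // iteration; runjob() returns this value, and a result of zero can be
  // used by the caller to detect convergence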
enum CUTSCOUNTER {
NUM_CUTS
}

  /**
   * Runs a single iteration of defining cluster boundaries, based on
   * previous calculations and the formation of the "cut matrix". A usage
   * sketch follows this method.
   *
   * @param currentAffinity Path to the current affinity matrix.
   * @param cutMatrix Path to the sensitivity ("cut") matrix.
   * @param nextAffinity Output path for the new affinity matrix.
   * @param conf Hadoop configuration for the job.
   * @return the number of affinity entries that were cut in this iteration
   */
public static long runjob(Path currentAffinity, Path cutMatrix, Path nextAffinity, Configuration conf)
throws IOException, ClassNotFoundException, InterruptedException {
    // these options allow us to differentiate between the two matrices
    // in the mapper and combiner - the name of the directory an input
    // split came from tells us which SequenceFile we're reading
conf.set(EigencutsKeys.AFFINITY_PATH, currentAffinity.getName());
conf.set(EigencutsKeys.CUTMATRIX_PATH, cutMatrix.getName());
Job job = new Job(conf, "EigencutsAffinityCutsJob");
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(VertexWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(VectorWritable.class);
job.setMapperClass(EigencutsAffinityCutsMapper.class);
job.setCombinerClass(EigencutsAffinityCutsCombiner.class);
job.setReducerClass(EigencutsAffinityCutsReducer.class);
    // both matrices are inputs: the combiner needs the affinity entry and
    // the cut-matrix entry for every (i, j) pair
    FileInputFormat.addInputPath(job, currentAffinity);
    FileInputFormat.addInputPath(job, cutMatrix);
FileOutputFormat.setOutputPath(job, nextAffinity);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
    return job.getCounters().findCounter(CUTSCOUNTER.NUM_CUTS).getValue();
}
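
  /*
   * A minimal usage sketch, assuming the (hypothetical) paths below are
   * SequenceFile directories produced by the earlier Eigencuts steps; in
   * Mahout this job is normally invoked from the Eigencuts driver loop:
   *
   *   Configuration conf = new Configuration();
   *   conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, numDataPoints);
   *   long numCuts = EigencutsAffinityCutsJob.runjob(
   *       new Path("eigencuts/affinity-0"),
   *       new Path("eigencuts/sensitivity-0"),
   *       new Path("eigencuts/affinity-1"), conf);
   *   if (numCuts == 0) {
   *     // nothing was cut this iteration - the clustering has converged
   *   }
   */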
public static class EigencutsAffinityCutsMapper
extends Mapper<IntWritable, VectorWritable, Text, VertexWritable> {
@Override
protected void map(IntWritable key, VectorWritable row, Context context)
throws IOException, InterruptedException {
// all this method does is construct a bunch of vertices, mapping those
// together which have the same *combination* of indices; for example,
// (1, 3) will have the same key as (3, 1) but a different key from (1, 1)
// and (3, 3) (which, incidentally, will also not be grouped together)
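      //
      // e.g., row 1 with an element at index 3 emits key "3_1", the same
      // key produced by row 3's element at index 1, so the pair
      // {A(1,3), A(3,1)} from each input matrix meets in one combiner call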
      // the parent directory of this input split tells us which matrix the
      // row came from (this assumes each input path is a directory of
      // SequenceFiles, as produced by the upstream jobs)
      String type = ((FileSplit) context.getInputSplit()).getPath().getParent().getName();
Vector vector = row.get();
for (Vector.Element e : vector) {
String newkey = Math.max(key.get(), e.index()) + "_" + Math.min(key.get(), e.index());
context.write(new Text(newkey), new VertexWritable(key.get(), e.index(), e.get(), type));
}
}
  }

public static class EigencutsAffinityCutsCombiner
extends Reducer<Text, VertexWritable, Text, VertexWritable> {
@Override
protected void reduce(Text t, Iterable<VertexWritable> vertices,
Context context) throws IOException, InterruptedException {
      // there should be exactly 4 items in the iterable: two from the
      // affinity matrix and two from the cut matrix, all carrying the same
      // (i, j) index combination. The idea is that if either of the two
      // cut-matrix vertices has a non-zero value, we want to:
      //
      // 1) zero out the two affinity vertices, and
      // 2) add their former values to the (i, i) and (j, j) coordinates
      //
      // step 2 is only staged here; the reducer performs the actual
      // summation on the diagonal. Note this logic assumes the combiner
      // runs exactly once over all four vertices of each key, which Hadoop
      // does not strictly guarantee for combiners
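      //
      // e.g., for key "3_1" with (hypothetical) affinity A(1,3) = A(3,1) = 0.7
      // and a non-zero sensitivity at (1,3): zeros are emitted for A(1,3)
      // and A(3,1), and 0.7 is staged for both the A(1,1) and A(3,3) sums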
Configuration conf = context.getConfiguration();
log.debug("{}", t);
boolean zero = false;
int i = -1;
int j = -1;
double k = 0;
int count = 0;
for (VertexWritable v : vertices) {
count++;
if (v.getType().equals(conf.get(EigencutsKeys.AFFINITY_PATH))) {
i = v.getRow();
j = v.getCol();
k = v.getValue();
} else if (v.getValue() != 0.0) {
zero = true;
}
}
      // only two vertices (one from each matrix) means this is a diagonal
      // element; we want to preserve whatever is currently in the diagonal,
      // since it acts as a running sum of all the values that have been
      // "cut" so far - simply pass this element through as is
if (count == 2) {
VertexWritable vw = new VertexWritable(i, j, k, "unimportant");
context.write(new Text(String.valueOf(i)), vw);
return;
}
// do we zero out the values?
VertexWritable outI = new VertexWritable();
VertexWritable outJ = new VertexWritable();
if (zero) {
// increment the cut counter
context.getCounter(CUTSCOUNTER.NUM_CUTS).increment(1);
// we want the values to exist on the diagonal
outI.setCol(i);
outJ.setCol(j);
// also, set the old values to zero
VertexWritable zeroI = new VertexWritable();
VertexWritable zeroJ = new VertexWritable();
zeroI.setCol(j);
zeroI.setValue(0);
zeroJ.setCol(i);
zeroJ.setValue(0);
zeroI.setType("unimportant");
zeroJ.setType("unimportant");
context.write(new Text(String.valueOf(i)), zeroI);
context.write(new Text(String.valueOf(j)), zeroJ);
} else {
outI.setCol(j);
outJ.setCol(i);
}
// set the values and write them
outI.setValue(k);
outJ.setValue(k);
outI.setType("unimportant");
outJ.setType("unimportant");
context.write(new Text(String.valueOf(i)), outI);
context.write(new Text(String.valueOf(j)), outJ);
}
  }

public static class EigencutsAffinityCutsReducer
extends Reducer<Text, VertexWritable, IntWritable, VectorWritable> {
@Override
protected void reduce(Text row, Iterable<VertexWritable> entries,
Context context) throws IOException, InterruptedException {
// now to assemble the vectors
RandomAccessSparseVector output = new RandomAccessSparseVector(
context.getConfiguration().getInt(EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE), 100);
int rownum = Integer.parseInt(row.toString());
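      // e.g., if row 1 receives (col=1, 0.2), (col=1, 0.7), and (col=3, 0.0)
      // (hypothetical values), the two diagonal contributions accumulate to
      // 0.9 at A(1,1), while the cut entry pins A(1,3) to zero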
for (VertexWritable e : entries) {
// first, are we setting a diagonal?
if (e.getCol() == rownum) {
// add to what's already present
output.setQuick(e.getCol(), output.getQuick(e.getCol()) + e.getValue());
} else {
// simply set the value
output.setQuick(e.getCol(), e.getValue());
}
}
context.write(new IntWritable(rownum), new VectorWritable(output));
}
}
}