/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.graph;
import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenIntIntHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * <p>Distributed computation of the adjacency matrix of a graph, see http://en.wikipedia.org/wiki/Adjacency_matrix</p>
*
 * <p>This job outputs {@link org.apache.hadoop.io.SequenceFile}s with an {@link IntWritable} as key and a {@link VectorWritable} as value</p>
*
* <p>Command line arguments specific to this class are:</p>
*
* <ol>
* <li>--output=(path): output path where the resulting matrix and the number of vertices should be written</li>
* <li>--vertices=(path): file containing a list of all vertices</li>
* <li>--edges=(path): Directory containing edges of the graph</li>
 * <li>--symmetric=(boolean): produce a symmetric adjacency matrix (corresponds to an undirected graph)</li>
* </ol>
*
* <p>General command line options are documented in {@link AbstractJob}.</p>
*
* <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.</p>
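 *
 * <p>A typical invocation might look like the following (the jar name and paths are
 * illustrative only):</p>
 *
 * <pre>
 * $ hadoop jar mahout-examples-job.jar org.apache.mahout.graph.AdjacencyMatrixJob \
 *     --vertices /graph/vertices.txt \
 *     --edges /graph/edges \
 *     --symmetric true \
 *     --output /graph/output
 * </pre>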
*/
public class AdjacencyMatrixJob extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(AdjacencyMatrixJob.class);
public static final String NUM_VERTICES = "numVertices.bin";
public static final String ADJACENCY_MATRIX = "adjacencyMatrix";
public static final String VERTEX_INDEX = "vertexIndex";
static final String NUM_VERTICES_PARAM = AdjacencyMatrixJob.class.getName() + ".numVertices";
static final String VERTEX_INDEX_PARAM = AdjacencyMatrixJob.class.getName() + ".vertexIndex";
static final String SYMMETRIC_PARAM = AdjacencyMatrixJob.class.getName() + ".symmetric";
public static void main(String[] args) throws Exception {
ToolRunner.run(new AdjacencyMatrixJob(), args);
}
@Override
public int run(String[] args) throws Exception {
addOption("vertices", null, "a text file containing all vertices of the graph (one per line)", true);
addOption("edges", null, "text files containing the edges of the graph (vertexA,vertexB per line)", true);
addOption("symmetric", null, "produce a symmetric adjacency matrix (corresponds to an undirected graph)",
String.valueOf(false));
addOutputOption();
Map<String, String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Path vertices = new Path(parsedArgs.get("--vertices"));
Path edges = new Path(parsedArgs.get("--edges"));
boolean symmetric = Boolean.parseBoolean(parsedArgs.get("--symmetric"));
log.info("Indexing vertices sequentially, this might take a while...");
    int numVertices = indexVertices(vertices, getOutputPath(VERTEX_INDEX));
    Preconditions.checkArgument(numVertices > 0, "No vertices found in %s", vertices);
    HadoopUtil.writeInt(numVertices, getOutputPath(NUM_VERTICES), getConf());
    log.info("Found {} vertices, creating adjacency matrix...", numVertices);
Job createAdjacencyMatrix = prepareJob(edges, getOutputPath(ADJACENCY_MATRIX), TextInputFormat.class,
VectorizeEdgesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
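    // VectorSumReducer serves as both combiner and reducer: it sums the partial row
    // vectors emitted per edge into a single row vector per vertex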
createAdjacencyMatrix.setCombinerClass(VectorSumReducer.class);
Configuration createAdjacencyMatrixConf = createAdjacencyMatrix.getConfiguration();
createAdjacencyMatrixConf.set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
createAdjacencyMatrixConf.set(VERTEX_INDEX_PARAM, getOutputPath(VERTEX_INDEX).toString());
createAdjacencyMatrixConf.setBoolean(SYMMETRIC_PARAM, symmetric);
    boolean succeeded = createAdjacencyMatrix.waitForCompletion(true);
    return succeeded ? 0 : -1;
}
//TODO do this in parallel?
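  /**
   * Assigns a sequential int index to every vertex by reading the vertex files and writing
   * (index, vertexID) pairs to a {@link SequenceFile} at {@code indexPath}.
   *
   * @return the number of vertices indexed
   */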
private int indexVertices(Path verticesPath, Path indexPath) throws IOException {
FileSystem fs = FileSystem.get(verticesPath.toUri(), getConf());
SequenceFile.Writer writer = null;
int index = 0;
try {
writer = SequenceFile.createWriter(fs, getConf(), indexPath, IntWritable.class, IntWritable.class);
for (FileStatus fileStatus : fs.listStatus(verticesPath)) {
InputStream in = null;
try {
in = HadoopUtil.openStream(fileStatus.getPath(), getConf());
        for (String line : new FileLineIterable(in)) {
          writer.append(new IntWritable(index++), new IntWritable(Integer.parseInt(line.trim())));
        }
} finally {
Closeables.closeQuietly(in);
}
}
} finally {
Closeables.closeQuietly(writer);
}
return index;
}
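  /**
   * Maps a textual edge "vertexA,vertexB" (comma- or tab-separated) to a partial row of
   * the adjacency matrix: a sparse vector for row index(vertexA) with a 1 at column
   * index(vertexB). If a symmetric matrix is requested, the mirrored entry is emitted
   * as well. E.g. with vertex index {12=0, 34=1}, the line "12,34" yields row 0 with a
   * 1 at column 1.
   */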
static class VectorizeEdgesMapper extends Mapper<LongWritable,Text,IntWritable,VectorWritable> {
private int numVertices;
private OpenIntIntHashMap vertexIDsToIndex;
private boolean symmetric;
private final IntWritable row = new IntWritable();
private static final Pattern SEPARATOR = Pattern.compile("[\t,]");
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
Configuration conf = ctx.getConfiguration();
numVertices = Integer.parseInt(conf.get(NUM_VERTICES_PARAM));
symmetric = conf.getBoolean(SYMMETRIC_PARAM, false);
Path vertexIndexPath = new Path(conf.get(VERTEX_INDEX_PARAM));
vertexIDsToIndex = new OpenIntIntHashMap(numVertices);
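      // the index file stores (index, vertexID) pairs; invert them into an in-memory map
      // so vertex IDs read from the edge files can be resolved to matrix indices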
for (Pair<IntWritable,IntWritable> indexAndVertexID :
new SequenceFileIterable<IntWritable,IntWritable>(vertexIndexPath, true, conf)) {
vertexIDsToIndex.put(indexAndVertexID.getSecond().get(), indexAndVertexID.getFirst().get());
}
}
@Override
    protected void map(LongWritable offset, Text line, Context ctx)
throws IOException, InterruptedException {
String[] tokens = SEPARATOR.split(line.toString());
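      // note: OpenIntIntHashMap.get() returns 0 for unknown keys, so an edge that refers
      // to a vertex missing from the index silently maps to row/column 0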
int rowIndex = vertexIDsToIndex.get(Integer.parseInt(tokens[0]));
int columnIndex = vertexIDsToIndex.get(Integer.parseInt(tokens[1]));
Vector partialTransitionMatrixRow = new SequentialAccessSparseVector(numVertices, 1);
row.set(rowIndex);
partialTransitionMatrixRow.setQuick(columnIndex, 1);
ctx.write(row, new VectorWritable(partialTransitionMatrixRow));
if (symmetric && rowIndex != columnIndex) {
partialTransitionMatrixRow = new SequentialAccessSparseVector(numVertices, 1);
row.set(columnIndex);
partialTransitionMatrixRow.setQuick(rowIndex, 1);
ctx.write(row, new VectorWritable(partialTransitionMatrixRow));
}
}
}
}