AdjacencyMatrixJob.java example

Explorer
mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.graph;

import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenIntIntHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * <p>Distributed computation of the adjacency matrix of a graph, see http://en.wikipedia.org/wiki/Adjacency_matrix
 *
 * <p>This job outputs {@link org.apache.hadoop.io.SequenceFile}s an {@link IntWritable} as key and a {@link VectorWritable}  as value</p>
 *
 * <p>Command line arguments specific to this class are:</p>
 *
 * <ol>
 *   <li>--output=(path): output path where the resulting matrix and the number of vertices should be written</li>
 *   <li>--vertices=(path): file containing a list of all vertices</li>
 *   <li>--edges=(path): Directory containing edges of the graph</li>
 *   <li>--symmetric = (boolean) produce a symmetric adjacency matrix (corresponds to an undirected graph)</li>
 * </ol>
 *
 * <p>General command line options are documented in {@link AbstractJob}.</p>
 *
 * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.</p>
 */
public class AdjacencyMatrixJob extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(AdjacencyMatrixJob.class);

  public static final String NUM_VERTICES = "numVertices.bin";
  public static final String ADJACENCY_MATRIX = "adjacencyMatrix";
  public static final String VERTEX_INDEX = "vertexIndex";

  static final String NUM_VERTICES_PARAM = AdjacencyMatrixJob.class.getName() + ".numVertices";
  static final String VERTEX_INDEX_PARAM = AdjacencyMatrixJob.class.getName() + ".vertexIndex";
  static final String SYMMETRIC_PARAM = AdjacencyMatrixJob.class.getName() + ".symmetric";

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new AdjacencyMatrixJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {

    addOption("vertices", null, "a text file containing all vertices of the graph (one per line)", true);
    addOption("edges", null, "text files containing the edges of the graph (vertexA,vertexB per line)", true);
    addOption("symmetric", null, "produce a symmetric adjacency matrix (corresponds to an undirected graph)",
        String.valueOf(false));

    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    Path vertices = new Path(parsedArgs.get("--vertices"));
    Path edges = new Path(parsedArgs.get("--edges"));
    boolean symmetric = Boolean.parseBoolean(parsedArgs.get("--symmetric"));

    log.info("Indexing vertices sequentially, this might take a while...");
    int numVertices = indexVertices(vertices, getOutputPath(VERTEX_INDEX));

    HadoopUtil.writeInt(numVertices, getOutputPath(NUM_VERTICES), getConf());
    Preconditions.checkArgument(numVertices > 0);

    log.info("Found " + numVertices + " vertices, creating adjacency matrix...");
    Job createAdjacencyMatrix = prepareJob(edges, getOutputPath(ADJACENCY_MATRIX), TextInputFormat.class,
        VectorizeEdgesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    createAdjacencyMatrix.setCombinerClass(VectorSumReducer.class);
    Configuration createAdjacencyMatrixConf = createAdjacencyMatrix.getConfiguration();
    createAdjacencyMatrixConf.set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
    createAdjacencyMatrixConf.set(VERTEX_INDEX_PARAM, getOutputPath(VERTEX_INDEX).toString());
    createAdjacencyMatrixConf.setBoolean(SYMMETRIC_PARAM, symmetric);
    createAdjacencyMatrix.waitForCompletion(true);

    return 0;
  }

  //TODO do this in parallel?
  private int indexVertices(Path verticesPath, Path indexPath) throws IOException {
    FileSystem fs = FileSystem.get(verticesPath.toUri(), getConf());
    SequenceFile.Writer writer = null;
    int index = 0;

    try {
      writer = SequenceFile.createWriter(fs, getConf(), indexPath, IntWritable.class, IntWritable.class);

      for (FileStatus fileStatus : fs.listStatus(verticesPath)) {
        InputStream in = null;
        try {
          in = HadoopUtil.openStream(fileStatus.getPath(), getConf());
          for (String line : new FileLineIterable(in)) {
            writer.append(new IntWritable(index++), new IntWritable(Integer.parseInt(line)));
          }
        } finally {
          Closeables.closeQuietly(in);
        }
      }
    } finally {
      Closeables.closeQuietly(writer);
    }

    return index;
  }

  static class VectorizeEdgesMapper extends Mapper<LongWritable,Text,IntWritable,VectorWritable> {

    private int numVertices;
    private OpenIntIntHashMap vertexIDsToIndex;
    private boolean symmetric;

    private final IntWritable row = new IntWritable();

    private static final Pattern SEPARATOR = Pattern.compile("[\t,]");

    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
      Configuration conf = ctx.getConfiguration();
      numVertices = Integer.parseInt(conf.get(NUM_VERTICES_PARAM));
      symmetric = conf.getBoolean(SYMMETRIC_PARAM, false);
      Path vertexIndexPath = new Path(conf.get(VERTEX_INDEX_PARAM));
      vertexIDsToIndex = new OpenIntIntHashMap(numVertices);
      for (Pair<IntWritable,IntWritable> indexAndVertexID :
          new SequenceFileIterable<IntWritable,IntWritable>(vertexIndexPath, true, conf)) {
        vertexIDsToIndex.put(indexAndVertexID.getSecond().get(), indexAndVertexID.getFirst().get());
      }
    }

    @Override
    protected void map(LongWritable offset, Text line, Mapper.Context ctx)
        throws IOException, InterruptedException {

      String[] tokens = SEPARATOR.split(line.toString());
      int rowIndex = vertexIDsToIndex.get(Integer.parseInt(tokens[0]));
      int columnIndex = vertexIDsToIndex.get(Integer.parseInt(tokens[1]));

      Vector partialTransitionMatrixRow = new SequentialAccessSparseVector(numVertices, 1);
      row.set(rowIndex);
      partialTransitionMatrixRow.setQuick(columnIndex, 1);
      ctx.write(row, new VectorWritable(partialTransitionMatrixRow));

      if (symmetric && rowIndex != columnIndex) {
        partialTransitionMatrixRow = new SequentialAccessSparseVector(numVertices, 1);
        row.set(columnIndex);
        partialTransitionMatrixRow.setQuick(rowIndex, 1);
        ctx.write(row, new VectorWritable(partialTransitionMatrixRow));
      }
    }
  }

}