package com.github.projectflink.hadoop;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.StringTokenizer;

/**
 * Driver for an iterative PageRank computation built from Hadoop MapReduce jobs.
 *
 * <p>The pipeline run by {@link #main(String[])} is:
 * <ol>
 *   <li>{@link #assignInitialRanks} converts a text adjacency list into a sequence file of
 *       {@link Message} records carrying the initial rank and the neighbor list;</li>
 *   <li>{@link #calculateNextRanks} is executed once per iteration, each run consuming the
 *       output of the previous one;</li>
 *   <li>{@link #printFinalRanks} writes the final ranks as text.</li>
 * </ol>
 */
public class PageRankDriver {

    private static final Log LOG = LogFactory.getLog(PageRankDriver.class);

    private static final double DAMPENING_FACTOR = 0.85;

    /** Configuration key under which the random-jump probability is handed to the tasks. */
    private static final String RANDOM_JUMP_KEY = "random_jump";

    /** Configuration key under which the dampening factor is handed to the tasks. */
    private static final String DAMPENING_FACTOR_KEY = "dampening_factor";

    /** Shared adjacency for vertices without outgoing edges; zero-length, hence never mutated. */
    private static final long[] NO_NEIGHBORS = new long[0];

    /**
     * Value type exchanged between the jobs.
     *
     * <p>A message with a non-null {@link #neighbors} array is a "structure" record: it carries
     * a vertex's adjacency list. A message whose {@code neighbors} is {@code null} is a pure
     * rank contribution. {@link PageRankReducer} relies on this distinction.
     */
    public static class Message implements Writable {

        public double prob;
        public long[] neighbors;

        /** Creates an empty message; required so the framework can instantiate it. */
        public Message() {
        }

        /**
         * @param prob      rank value carried by the message
         * @param neighbors adjacency list, or {@code null} for a pure rank contribution
         */
        public Message(double prob, long[] neighbors) {
            this.prob = prob;
            this.neighbors = neighbors;
        }

        /**
         * Creates a pure rank contribution (no adjacency list).
         *
         * @param prob rank value carried by the message
         */
        public Message(double prob) {
            this.prob = prob;
            this.neighbors = null;
        }

        /**
         * @return the number of neighbors, or 0 when this message carries no adjacency list
         */
        public int numNeighbors() {
            return neighbors == null ? 0 : neighbors.length;
        }

        /**
         * Serializes the message as: prob, a presence flag, and (if present) the length-prefixed
         * neighbor array.
         */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeDouble(prob);
            if (neighbors == null) {
                dataOutput.writeBoolean(false);
                return;
            }
            dataOutput.writeBoolean(true);
            dataOutput.writeInt(neighbors.length);
            for (long neighbor : neighbors) {
                dataOutput.writeLong(neighbor);
            }
        }

        /** Reads the layout produced by {@link #write(DataOutput)}, overwriting both fields. */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.prob = dataInput.readDouble();
            if (!dataInput.readBoolean()) {
                this.neighbors = null;
                return;
            }
            int length = dataInput.readInt();
            this.neighbors = new long[length];
            for (int i = 0; i < length; i++) {
                this.neighbors[i] = dataInput.readLong();
            }
        }
    }

    /**
     * Map side of one PageRank iteration.
     *
     * <p>For every vertex it forwards the structure record unchanged, sends
     * {@code prob / numNeighbors * dampeningFactor} to each neighbor, and sends
     * {@code (1 - dampeningFactor) * randomJump} to the vertex itself.
     */
    public static class PageRankMapper extends Mapper<LongWritable, Message, LongWritable, Message> {

        /** Reused output value for rank contributions; its neighbors stay null on purpose. */
        private Message m = new Message(-1.0, null);
        private LongWritable nid = new LongWritable();
        private double randomJump;
        private double dampeningFactor;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            Configuration conf = context.getConfiguration();
            randomJump = Double.parseDouble(conf.get(RANDOM_JUMP_KEY));
            dampeningFactor = Double.parseDouble(conf.get(DAMPENING_FACTOR_KEY));
        }

        @Override
        protected void map(LongWritable key, Message value, Context context)
                throws IOException, InterruptedException {
            // A record without adjacency data must still be forwarded as a structure record;
            // with null neighbors the reducer would add its old rank to the new one.
            if (value.neighbors == null) {
                value.neighbors = NO_NEIGHBORS;
            }
            context.write(key, value);

            int n = value.neighbors.length;
            if (n > 0) {
                // Every neighbor receives the same share, so compute it once.
                m.prob = (value.prob / n) * dampeningFactor;
                for (long neighbor : value.neighbors) {
                    nid.set(neighbor);
                    context.write(nid, m);
                }
            }

            m.prob = (1.0 - dampeningFactor) * randomJump;
            context.write(key, m);
        }
    }

    /**
     * Reduce side of one PageRank iteration: sums all rank contributions for a vertex and
     * re-attaches the vertex's adjacency list taken from its structure record.
     */
    public static class PageRankReducer extends Reducer<LongWritable, Message, LongWritable, Message> {

        private Message out = new Message();

        @Override
        protected void reduce(LongWritable key, Iterable<Message> values, Context context)
                throws IOException, InterruptedException {
            double rank = 0.0;
            // Start from an empty adjacency for every key: 'out' is reused across calls, so a
            // vertex without a structure record must not inherit the previous key's neighbors.
            long[] adjacency = NO_NEIGHBORS;
            for (Message message : values) {
                if (message.neighbors != null) {
                    // Copy: the framework may reuse the value instance while iterating.
                    adjacency = Arrays.copyOf(message.neighbors, message.neighbors.length);
                } else {
                    rank += message.prob;
                }
            }
            out.neighbors = adjacency;
            out.prob = rank;
            context.write(key, out);
        }
    }

    /**
     * Parses one adjacency-list line per record. A line consists of space-separated longs: the
     * vertex id followed by the ids of its neighbors. Each vertex is emitted with the
     * random-jump probability as its initial rank.
     */
    public static class InitialRankAssigner extends Mapper<LongWritable, Text, LongWritable, Message> {

        private double randomJump;
        private Message outValue = new Message();
        private LongWritable outKey = new LongWritable();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            randomJump = Double.parseDouble(context.getConfiguration().get(RANDOM_JUMP_KEY));
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString(), " ");
            int count = tokenizer.countTokens();
            if (count == 0) {
                // Blank line (e.g. a trailing newline in the input): there is no vertex to emit.
                return;
            }

            outKey.set(Long.parseLong(tokenizer.nextToken()));
            long[] neighbors = new long[count - 1];
            for (int i = 0; i < neighbors.length; i++) {
                neighbors[i] = Long.parseLong(tokenizer.nextToken());
            }

            outValue.neighbors = neighbors;
            outValue.prob = randomJump;
            context.write(outKey, outValue);
        }
    }

    /** Renders each vertex as the text {@code "<vertexId> <rank>"} with an empty value. */
    public static class RankPrinter extends Mapper<LongWritable, Message, Text, Text> {

        Text text = new Text();
        Text empty = new Text();

        @Override
        protected void map(LongWritable key, Message value, Context context)
                throws IOException, InterruptedException {
            text.set(key.get() + " " + value.prob);
            context.write(text, empty);
        }
    }

    /**
     * Runs the map-only job that turns the text adjacency list into a sequence file of
     * {@link Message} records with initial ranks. An existing output path is deleted first.
     *
     * @param conf          job configuration; must contain the random-jump setting
     * @param fs            file system holding the input and output paths
     * @param adjacencyPath text input path
     * @param initialPath   sequence-file output path
     * @param numVertices   currently unused; kept for interface compatibility
     * @throws IOException if the job does not complete successfully
     * @throws Exception   on any failure while setting up or running the job
     */
    public static void assignInitialRanks(Configuration conf, FileSystem fs, String adjacencyPath,
            String initialPath, int numVertices) throws Exception {
        Path seqFile = new Path(initialPath);
        deleteIfExists(fs, seqFile);

        Job job = Job.getInstance(conf);
        job.setJarByClass(InitialRankAssigner.class);
        job.setMapperClass(InitialRankAssigner.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Message.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Message.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(adjacencyPath));
        FileOutputFormat.setOutputPath(job, seqFile);

        runToCompletion(job, "initial rank assignment");
    }

    /**
     * Runs one PageRank iteration ({@link PageRankMapper} + {@link PageRankReducer}) from
     * {@code inputPath} to {@code outputPath}. An existing output path is deleted first.
     *
     * @param conf       job configuration; must contain random-jump and dampening settings
     * @param fs         file system holding the input and output paths
     * @param inputPath  sequence-file input produced by a previous step
     * @param outputPath sequence-file output path
     * @throws IOException if the job does not complete successfully
     * @throws Exception   on any failure while setting up or running the job
     */
    public static void calculateNextRanks(Configuration conf, FileSystem fs, String inputPath,
            String outputPath) throws Exception {
        Path outFile = new Path(outputPath);
        deleteIfExists(fs, outFile);

        Job job = Job.getInstance(conf);
        job.setJarByClass(PageRankMapper.class);
        job.setMapperClass(PageRankMapper.class);
        job.setReducerClass(PageRankReducer.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Message.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Message.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, outFile);

        runToCompletion(job, "rank iteration");
    }

    /**
     * Runs the map-only job that writes the ranks as text. An existing output path is deleted
     * first.
     *
     * @param conf       job configuration
     * @param fs         file system holding the input and output paths
     * @param inputPath  sequence-file input holding the ranks to print
     * @param outputPath text output path
     * @throws IOException if the job does not complete successfully
     * @throws Exception   on any failure while setting up or running the job
     */
    public static void printFinalRanks(Configuration conf, FileSystem fs, String inputPath,
            String outputPath) throws Exception {
        Path outFile = new Path(outputPath);
        deleteIfExists(fs, outFile);

        Job job = Job.getInstance(conf);
        job.setMapperClass(RankPrinter.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setJarByClass(RankPrinter.class);
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, outFile);

        runToCompletion(job, "rank printing");
    }

    /**
     * Entry point.
     *
     * <p>Expected arguments after the generic Hadoop options:
     * {@code <adjacencyFile> <resultFile> <numVertices> <numIterations>}.
     *
     * @param args command-line arguments
     * @throws IllegalArgumentException if arguments are missing or out of range
     * @throws Exception                if any of the jobs fails
     */
    public static void main(String[] args) throws Exception {
        GenericOptionsParser parser = new GenericOptionsParser(args);
        String[] remArgs = parser.getRemainingArgs();
        if (remArgs.length < 4) {
            throw new IllegalArgumentException(
                    "Usage: PageRankDriver <adjacencyFile> <resultFile> <numVertices> <numIterations>");
        }

        String adjacencyFile = remArgs[0];
        String resultFile = remArgs[1];
        int numVertices = Integer.parseInt(remArgs[2]);
        int numIterations = Integer.parseInt(remArgs[3]);
        if (numVertices <= 0) {
            throw new IllegalArgumentException("numVertices must be positive but was " + numVertices);
        }
        if (numIterations < 0) {
            throw new IllegalArgumentException(
                    "numIterations must not be negative but was " + numIterations);
        }

        Configuration conf = parser.getConfiguration();
        double randomJump = 1.0 / ((double) numVertices);
        conf.set(RANDOM_JUMP_KEY, String.valueOf(randomJump));
        conf.set(DAMPENING_FACTOR_KEY, String.valueOf(DAMPENING_FACTOR));

        FileSystem fs = FileSystem.get(conf);

        String adjacencySeq = adjacencyFile + "_seq";
        assignInitialRanks(conf, fs, adjacencyFile, adjacencySeq, numVertices);

        // Each iteration reads what the previous step wrote. With zero iterations the initial
        // ranks are printed instead of passing a null path to the print job.
        String inputFile = adjacencySeq;
        for (int iteration = 0; iteration < numIterations; iteration++) {
            String outputFile = "/pageranks_iteration_" + iteration;
            LOG.info("Running PageRank iteration " + iteration + " into " + outputFile);
            calculateNextRanks(conf, fs, inputFile, outputFile);
            inputFile = outputFile;
        }

        printFinalRanks(conf, fs, inputFile, resultFile);
    }

    /** Recursively deletes {@code path} when it exists so a job can write to it again. */
    private static void deleteIfExists(FileSystem fs, Path path) throws IOException {
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
    }

    /** Runs the job verbosely and fails loudly instead of letting later steps read bad data. */
    private static void runToCompletion(Job job, String description) throws Exception {
        if (!job.waitForCompletion(true)) {
            throw new IOException("PageRank " + description + " job failed");
        }
    }
}