package com.skp.experiment.common;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
/**
 * Map-only job that reads a sequence file of {@code IntWritable -> VectorWritable}
 * rows and writes them out as text.
 *
 * <p>Two output formats are supported, selected with the {@code --outFormat} option:
 * <ul>
 *   <li>{@code vector} (default): one line per row,
 *       {@code rowKey \t \t index:value \t index:value ...}</li>
 *   <li>{@code raw}: one line per non-zero element, {@code rowKey,index,value}</li>
 * </ul>
 * Only the non-zero elements of each row vector are written.
 */
public class DistributedRowMatrix2TextJob extends AbstractJob {
  /** Name of both the command-line option and the configuration key carrying the format. */
  private static final String OUTPUT_FORMAT = "outFormat";
  /** One line per row: key followed by tab-separated {@code index:value} pairs. */
  private static final String FORMAT_VECTOR = "vector";
  /** One line per non-zero element: {@code key,index,value}. */
  private static final String FORMAT_RAW = "raw";

  /**
   * Command-line entry point.
   *
   * @param args command-line arguments (input, output and optional output format)
   * @throws Exception if the job cannot be configured or submitted
   */
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new DistributedRowMatrix2TextJob(), args);
    // Propagate failures to the calling shell; a successful run returns normally.
    if (exitCode != 0) {
      System.exit(exitCode);
    }
  }

  /**
   * Parses the arguments, then configures and runs the conversion job.
   *
   * @param args command-line arguments
   * @return {@code -1} if the arguments could not be parsed, {@code 0} if the job
   *         succeeded, {@code 1} if the job failed
   * @throws IllegalArgumentException if the requested output format is not supported
   * @throws Exception if the job cannot be configured or submitted
   */
  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(OUTPUT_FORMAT, "of", "output format. {raw, vector}.", FORMAT_VECTOR);
    if (parseArguments(args) == null) {
      return -1;
    }

    String format = getOption(OUTPUT_FORMAT);
    // Reject unknown formats up front; otherwise the mapper would emit nothing
    // and the job would "succeed" with empty output.
    if (!isSupportedFormat(format)) {
      throw new IllegalArgumentException(
          "Unsupported " + OUTPUT_FORMAT + " '" + format + "'; expected one of {"
              + FORMAT_RAW + ", " + FORMAT_VECTOR + "}");
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    Job job = prepareJob(input, output,
        SequenceFileInputFormat.class, DistributedRowMatrix2TextMapper.class,
        NullWritable.class, Text.class, TextOutputFormat.class);
    job.setJarByClass(DistributedRowMatrix2TextJob.class);

    Configuration conf = job.getConfiguration();
    conf.set(OUTPUT_FORMAT, format);

    // Report job failure to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : 1;
  }

  /** Returns whether {@code format} names one of the supported output formats. */
  private static boolean isSupportedFormat(String format) {
    return FORMAT_VECTOR.equals(format) || FORMAT_RAW.equals(format);
  }

  /**
   * Mapper that renders each input row as text, in the format configured under
   * the {@code outFormat} configuration key. When the key is absent the
   * {@code vector} format is used.
   */
  public static class DistributedRowMatrix2TextMapper extends
      Mapper<IntWritable, VectorWritable, NullWritable, Text> {
    /** Reused output value; per-instance so no state is shared between mapper instances. */
    private final Text outValue = new Text();
    /** Format selected in {@link #setup}; defaults to the vector format. */
    private String outputFormat = FORMAT_VECTOR;

    /**
     * Reads the output format from the job configuration.
     *
     * @throws IllegalArgumentException if the configured format is not supported
     */
    @Override
    protected void setup(Context context) throws IOException,
        InterruptedException {
      String configured = context.getConfiguration().get(OUTPUT_FORMAT);
      if (configured == null) {
        return;
      }
      if (!isSupportedFormat(configured)) {
        throw new IllegalArgumentException(
            "Unsupported " + OUTPUT_FORMAT + " '" + configured + "'; expected one of {"
                + FORMAT_RAW + ", " + FORMAT_VECTOR + "}");
      }
      outputFormat = configured;
    }

    /**
     * Writes the non-zero elements of one row, either as a single line
     * (vector format) or as one line per element (raw format).
     *
     * @param key row index
     * @param value row vector
     * @param context task context the text records are written to
     */
    @Override
    protected void map(IntWritable key, VectorWritable value, Context context)
        throws IOException, InterruptedException {
      Iterator<Vector.Element> iter = value.get().iterateNonZero();
      if (FORMAT_RAW.equals(outputFormat)) {
        while (iter.hasNext()) {
          Vector.Element e = iter.next();
          outValue.set(key.get() + "," + e.index() + "," + e.get());
          context.write(NullWritable.get(), outValue);
        }
        return;
      }

      // Vector format. The key is followed by a tab and every element is
      // preceded by a tab, so two tabs separate the key from the first element;
      // this is kept so the emitted text stays identical to earlier output.
      StringBuilder line = new StringBuilder();
      line.append(key.get()).append("\t");
      while (iter.hasNext()) {
        Vector.Element e = iter.next();
        line.append("\t").append(e.index()).append(":").append(e.get());
      }
      outValue.set(line.toString());
      context.write(NullWritable.get(), outValue);
    }
  }
}