package com.skp.experiment.common; import java.io.IOException; import java.util.Iterator; import java.util.Map; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.math.Vector; import org.apache.mahout.math.VectorWritable; /** */ public class DistributedRowMatrixNormalizeJob extends AbstractJob { private static final String NORM_OUTPUT_TYPE = DistributedRowMatrixNormalizeJob.class.getName() + ".normOutputType"; private static final String REVERSE_OPTION = DistributedRowMatrixNormalizeJob.class.getName() + ".reverseOption"; private static final String DELIMETER = ","; public static void main(String[] args) throws Exception { ToolRunner.run(new DistributedRowMatrixNormalizeJob(), args); } @Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption("outputType", "otype", "output type{true if vector, otherwise false}", String.valueOf(true)); addOption("reverse", "r", "true if want to reverse row idx and col idx(transpose).", String.valueOf(false)); Map<String, String> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } @SuppressWarnings("rawtypes") Class<? extends WritableComparable> keyClass = getOption("outputType").equals("true") ? IntWritable.class : NullWritable.class; Class<? extends Writable> valueClass = getOption("outputType").equals("true") ? VectorWritable.class : Text.class; @SuppressWarnings("rawtypes") Class<? extends OutputFormat> outFileFormat = getOption("outputType").equals("true") ? SequenceFileOutputFormat.class : TextOutputFormat.class; Job normJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, VectorNormMapper.class, keyClass, valueClass, outFileFormat); normJob.getConfiguration().setBoolean(NORM_OUTPUT_TYPE, getOption("outputType").equals("true") ? true : false); normJob.getConfiguration().setBoolean(REVERSE_OPTION, getOption("reverse").equals("true") ? true : false); normJob.waitForCompletion(true); return 0; } @SuppressWarnings("rawtypes") public static class VectorNormMapper extends Mapper<IntWritable,VectorWritable,WritableComparable,Writable> { private static Text outValue = new Text(); private static boolean vectorOutput = true; private static boolean reverse = false; @Override protected void setup(Context ctx) throws IOException, InterruptedException { vectorOutput = ctx.getConfiguration().getBoolean(NORM_OUTPUT_TYPE, true); reverse = ctx.getConfiguration().getBoolean(REVERSE_OPTION, false); } @Override protected void map(IntWritable row, VectorWritable vectorWritable, Context ctx) throws IOException, InterruptedException { Vector normVector = vectorWritable.get().normalize(); if (vectorOutput) { ctx.write(row, new VectorWritable(normVector)); } else { Iterator<Vector.Element> iter = normVector.iterateNonZero(); while (iter.hasNext()) { Vector.Element e = iter.next(); if (reverse) { outValue.set(e.index() + DELIMETER + row.get() + DELIMETER + e.get()); } else { outValue.set(row.get() + DELIMETER + e.index() + DELIMETER + e.get()); } ctx.write(NullWritable.get(), outValue); } } } } }