package com.skp.experiment.common; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import java.util.Random; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; import org.apache.mahout.math.VectorWritable; import org.apache.mahout.math.map.OpenIntObjectHashMap; import com.skp.experiment.common.mapreduce.ReferenceMapper; import com.skp.experiment.common.parameter.DefaultOptionCreator; public class IdentityJob extends AbstractJob { public static void main(String[] args) throws Exception { ToolRunner.run(new IdentityJob(), args); } @Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption("refer", null, "reference table"); Map<String, String> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } //OpenIntObjectHashMap<Vector> testCases = createTestData(new Path(getOption("refer"))); int i = 0; for (String s : getOption("refer").split(DefaultOptionCreator.COMMA_DELIMETER)) { i++; Map<IntWritable, VectorWritable> testCases = createTestData(new Path(s)); for (Entry<IntWritable, VectorWritable> v : testCases.entrySet()) { System.out.println(i + "\t" + v.getKey() + "\t" + v.getValue()); } } Job job = prepareJob(getInputPath(), getOutputPath("time-" + System.currentTimeMillis()), TextInputFormat.class, MyMapper.class, NullWritable.class, Text.class, TextOutputFormat.class); job.getConfiguration().set(ReferenceMapper.REFERENCE_PATHS, getOption("refer")); job.waitForCompletion(true); return 0; } private Map<IntWritable, VectorWritable> createTestData(Path referencePath) throws IOException { FileSystem fs = FileSystem.get(getConf()); SequenceFile.Writer writer = null; Map<IntWritable, VectorWritable> ret = new HashMap<IntWritable, VectorWritable>(); try { writer = SequenceFile.createWriter(fs, getConf(), referencePath, IntWritable.class, VectorWritable.class); Random random = new Random(); for (int i = 0; i < 10; i++) { Vector v = new RandomAccessSparseVector(10); for (int j = 0; j < 10; j++) { v.set(j, random.nextDouble() * 10); } ret.put(new IntWritable(i), new VectorWritable(v)); writer.append(new IntWritable(i), new VectorWritable(v)); } } finally { IOUtils.closeStream(writer); } return ret; } public static class MyMapper extends ReferenceMapper<LongWritable, Text, NullWritable, Text, IntWritable, VectorWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { } } }