package mia.clustering.ch12.lastfm;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.VectorWritable;

public class VectorCreationJob {

  private VectorCreationJob() {}

  public static void createVectors(Path input, Path output, Path dictionaryPath)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = createNewConfiguration();

    // Read the dictionary produced by generateDictionary() and assign each
    // unique term a sequential integer id. The counts stored as values in
    // the dictionary files are ignored here; only the ordering matters.
    // Each part file also gets a "-dict" SequenceFile holding the
    // term -> id mapping for later lookup.
    Map<String,Integer> dictionary = new HashMap<String,Integer>();
    FileSystem fs = FileSystem.get(dictionaryPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(dictionaryPath, "part-*"));
    int i = 0;
    for (FileStatus fileStatus : outputFiles) {
      Path path = fileStatus.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
          new Path(path.toString() + "-dict"), Text.class, IntWritable.class);
      Text key = new Text();
      IntWritable value = new IntWritable();
      while (reader.next(key, value)) {
        dictionary.put(key.toString(), Integer.valueOf(i));
        writer.append(key, new IntWritable(i));
        i++;
      }
      reader.close();
      writer.close();
    }

    // Serialize the in-memory dictionary into the job configuration so that
    // every mapper can rebuild it in setup(). This relies on
    // JavaSerialization being registered in io.serializations
    // (see createNewConfiguration()).
    DefaultStringifier<Map<String,Integer>> mapStringifier =
        new DefaultStringifier<Map<String,Integer>>(conf, GenericsUtil.getClass(dictionary));
    conf.set("dictionary", mapStringifier.toString(dictionary));

    Job job = new Job(conf, "Generating vectors from input " + input);
    job.setJarByClass(VectorCreationJob.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    HadoopUtil.delete(conf, output);  // clear any previous output
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(VectorMapper.class);
    job.setCombinerClass(VectorReducer.class);
    job.setReducerClass(VectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);
  }

  public static void generateDictionary(Path input, Path output)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = createNewConfiguration();
    Job job = new Job(conf, "Generating dictionary from input " + input);
    job.setJarByClass(VectorCreationJob.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    HadoopUtil.delete(conf, output);  // clear any previous output
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(DictionaryMapper.class);
    job.setCombinerClass(DictionaryReducer.class);
    job.setReducerClass(DictionaryReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);
  }

  private static Configuration createNewConfiguration() {
    Configuration conf = new Configuration();
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    // JavaSerialization is required so that DefaultStringifier can
    // serialize the java.util.HashMap dictionary into the configuration.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
    return conf;
  }
}