package com.skp.experiment.cf.als.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.mahout.math.VectorWritable;

import com.skp.experiment.common.mapreduce.IdentityMapper;
import com.skp.experiment.common.mapreduce.IdentityReducer;
import com.skp.experiment.common.mapreduce.MapFileOutputFormat;

/**
 * Converts a directory of SequenceFiles ({@link IntWritable} keys, {@link VectorWritable} values)
 * into MapFile format so that individual rows can be looked up by key.
 */
public class CreateMapFileFromSeq {

  /** Index every key; larger values trade lookup speed for a smaller index. */
  private static Integer indexInterval = 1;

  /**
   * Merges all part files under the given input directory and rewrites the
   * directory in place as MapFiles.
   */
  public static void createMapFile(Path seqFiles) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path merged = new Path(seqFiles.toString() + ".merged");
    mergeSequenceFiles(seqFiles, merged);
    // Replace the original SequenceFile directory with the MapFile output.
    fs.delete(seqFiles, true);
    fs.rename(merged, seqFiles);
    //fs.rename(new Path(merged, "part-00000"), seqFiles);
    //fs.delete(merged, true);
  }

  public static void mergeSequenceFiles(Path input, Path output)
      throws IOException, InterruptedException, ClassNotFoundException {
    /* Legacy old-API (JobConf) implementation, kept for reference:
    JobConf conf = new JobConf(CreateMapFileFromSeq.class);
    conf.setJobName("Create Map File");
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);
    conf.setMapperClass(org.apache.hadoop.mapred.lib.IdentityMapper.class);
    conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);
    conf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    conf.setInt("io.map.index.interval", indexInterval);
    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.LzoCodec");
    conf.setNumReduceTasks(1);
    conf.setPartitionerClass(UserIDPartitioner.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    JobClient.runJob(conf);
    */
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Create MapFileOutputFormat file.");
    // Ship the containing jar so IdentityMapper/IdentityReducer resolve on the cluster
    // (the old JobConf version did this via new JobConf(CreateMapFileFromSeq.class)).
    job.setJarByClass(CreateMapFileFromSeq.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    // Identity map/reduce: records pass through unchanged; the shuffle re-sorts
    // them by key so MapFileOutputFormat can build its index.
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.getConfiguration().setInt("io.map.index.interval", indexInterval);
    //job.getConfiguration().setBoolean("mapred.output.compress", true);
    //job.getConfiguration().set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.LzoCodec");
    //job.setPartitionerClass(UserIDPartitioner.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    // Fail loudly; otherwise createMapFile() would delete the original input
    // even when the merge job did not succeed.
    if (!job.waitForCompletion(true)) {
      throw new IllegalStateException("MapFile conversion job failed for input " + input);
    }
  }
}
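
/*
 * Usage sketch (illustrative, not part of the original class): assumes the input
 * directory contains SequenceFiles of <IntWritable, VectorWritable> pairs, e.g.
 * ALS feature vectors, and that the default FileSystem of a fresh Configuration
 * is the one holding them. The directory is rewritten in place, so callers should
 * only pass data they are prepared to have replaced by the MapFile output.
 *
 *   Path userFeatures = new Path("/als/tmp/U");        // hypothetical HDFS path
 *   CreateMapFileFromSeq.createMapFile(userFeatures);  // directory now holds MapFiles
 */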