package com.skp.experiment.integeration.common;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;

/**
 * This job assigns a unique sequential id to every input line, starting from a configurable
 * start index (default 0). The data flow is as follows:
 * 1. The first mapper assigns a row number within its partition and emits it, keyed by partition id.
 * 2. The first reducer keeps the maximum row number per partition id, i.e. the partition's record count.
 * 3. A sequence file dir iterator reads the per-partition record counts and computes an id offset
 *    for each partition.
 * 4. The second mapper loads the per-partition offsets, reads the first mapper's intermediate files,
 *    and emits (row number within the partition) + (offset of that partition).
 * @author doyoungYoon
 */
public class SequentialIdGeneratorJob extends AbstractJob {
  //private static final Logger log = LoggerFactory.getLogger(SequentialIdGeneratorJob.class);
  public static final String RECORDS_PATH = SequentialIdGeneratorJob.class.getName() + ".recordsPath";
  public static final String SUMMARY_PATH = SequentialIdGeneratorJob.class.getName() + ".summaryPath";
  public static final String START_INDEX = SequentialIdGeneratorJob.class.getName() + ".startIndex";
  public static long totalIdCount = 0;
  private static final String DELIMETER = ",";

  public static enum COUNT { TOTAL_ID_COUNT };

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new SequentialIdGeneratorJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("cleanUp", "clean", "true if you want to clean up intermediate files.", String.valueOf(true));
    addOption("startIndex", "start", "start index.", String.valueOf(0));
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    Path inputPath = getInputPath();
    Path recordsPath = getTempPath("records");
    Path summaryPath = getTempPath("summary");
    FileSystem fs = FileSystem.get(getConf());

    // 1. count how many records (lines) each partition contains.
    // 2. store each partition's lines into temp files.
    //    step 2 is necessary because Hadoop may assign the input to different partition ids on each run.
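    // Illustrative sketch of the flow (the numbers are hypothetical, not produced by this code):
    // suppose the input is split into two map partitions with 3 and 2 lines and startIndex is 100.
    // The count job writes the summary {0 -> 3, 1 -> 2}; AssignRecordIdMapper turns it into the
    // offsets {0 -> 100, 1 -> 103} (assuming the summary is read in partition-id order), and the
    // final ids are 100..102 for partition 0 and 103..104 for partition 1.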
    Job countJob = prepareJob(inputPath, summaryPath, TextInputFormat.class,
        CountPartitionRecordNumMapper.class, IntWritable.class, LongWritable.class,
        CountPartitionRecordNumReducer.class, IntWritable.class, LongWritable.class,
        SequenceFileOutputFormat.class);
    countJob.getConfiguration().set(RECORDS_PATH, recordsPath.toString());
    countJob.setCombinerClass(CountPartitionRecordNumReducer.class);
    countJob.waitForCompletion(true);

    Job generateJob = prepareJob(recordsPath, getOutputPath(), SequenceFileInputFormat.class,
        AssignRecordIdMapper.class, NullWritable.class, Text.class, TextOutputFormat.class);
    generateJob.getConfiguration().set(SUMMARY_PATH, summaryPath.toString());
    generateJob.getConfiguration().setLong(START_INDEX,
        getOption("startIndex") == null ? 0 : Long.parseLong(getOption("startIndex")));
    generateJob.waitForCompletion(true);

    // clean up intermediate files
    if (getOption("cleanUp").equals("true")) {
      if (fs.exists(recordsPath)) {
        fs.delete(recordsPath, true);
      }
      if (fs.exists(summaryPath)) {
        fs.delete(summaryPath, true);
      }
      fs.deleteOnExit(getTempPath());
    }
    // record how many ids have been created
    totalIdCount = generateJob.getCounters().findCounter(SequentialIdGeneratorJob.COUNT.TOTAL_ID_COUNT).getValue();
    return 0;
  }

  /**
   * 1. counts the number of records per partition id.
   * 2. stores ("partitionId,lineNumber", line) into a per-partition temp sequence file.
   */
  public static class CountPartitionRecordNumMapper extends Mapper<LongWritable, Text, IntWritable, LongWritable> {
    private static long count = 0;
    private static long curId = 0;
    private static IntWritable partitionId = new IntWritable(0);
    private static Text newKey = new Text();
    private SequenceFile.Writer writer;

    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      int id = conf.getInt("mapred.task.partition", -1);
      partitionId.set(id);
      // get records directory
      String dir = conf.get(RECORDS_PATH);
      // set up the sequence file for this partition.
      FileSystem fs = FileSystem.get(conf);
      Path writerPath = new Path(dir, String.format("records%05d", partitionId.get()));
      writer = SequenceFile.createWriter(fs, conf, writerPath, Text.class, Text.class);
      // reset the per-task counters (both fields are static, so a reused JVM would otherwise carry them over)
      count = 0;
      curId = 0;
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      IOUtils.closeStream(writer);
    }

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      // note: the line count for this partition is 1 based.
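      //       count feeds the summary used for the offsets, while curId is the 0-based position
      //       stored in the intermediate record key.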
      // ex) if there are 10 records in this partition, count should be 10,
      //     but the id should be 9 if we want a 0-based id
      context.write(partitionId, new LongWritable(++count));
      newKey.set(partitionId.get() + DELIMETER + curId++);
      //System.out.println(newKey.toString());
      writer.append(newKey, value);
    }
  }

  /**
   * gets the maximum value per partition id
   */
  public static class CountPartitionRecordNumReducer extends Reducer<IntWritable, LongWritable, IntWritable, LongWritable> {
    public void reduce(IntWritable key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
      long max = 0;
      for (LongWritable val : values) {
        if (val.get() > max) {
          max = val.get();
        }
      }
      context.write(key, new LongWritable(max));
    }
  }

  public static class AssignRecordIdMapper extends Mapper<Text, Text, NullWritable, Text> {
    private static Text outValue = new Text();
    Map<Integer, Long> offsets;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Path summaryPath = new Path(context.getConfiguration().get(SUMMARY_PATH));
      long startIndex = context.getConfiguration().getLong(START_INDEX, 0);
      offsets = buildOffsets(summaryPath, startIndex);
    }

    private Map<Integer, Long> buildOffsets(Path input, long startIndex) throws IOException {
      Map<Integer, Long> offsets = new HashMap<Integer, Long>();
      SequenceFileDirIterator<IntWritable, LongWritable> iter =
          new SequenceFileDirIterator<IntWritable, LongWritable>(new Path(input + "/part*"),
              PathType.GLOB, null, null, true, new Configuration());
      long cusum = startIndex;
      while (iter.hasNext()) {
        Pair<IntWritable, LongWritable> e = iter.next();
        int partitionId = e.getFirst().get();
        long currentLineNum = e.getSecond().get();
        offsets.put(partitionId, cusum);
        cusum += currentLineNum;
      }
      return offsets;
    }

    @Override
    public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
      // the key consists of the partition id and the line number within that partition
      String[] tokens = key.toString().split(DELIMETER);
      int partitionId = Integer.parseInt(tokens[0]);
      long lineNumInPartition = Long.parseLong(tokens[1]);
      if (offsets.containsKey(partitionId)) {
        long curId = lineNumInPartition + offsets.get(partitionId);
        outValue.set(curId + DELIMETER + value);
        context.write(NullWritable.get(), outValue);
        context.getCounter(SequentialIdGeneratorJob.COUNT.TOTAL_ID_COUNT).increment(1);
      }
    }
  }
}
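
// Usage sketch (a hedged example, not from the original source): the jar name and HDFS paths below
// are hypothetical; --input/--output/--tempDir come from Mahout's AbstractJob, and --startIndex/--cleanUp
// from the addOption() calls in run().
//
//   hadoop jar experiment-jobs.jar com.skp.experiment.integeration.common.SequentialIdGeneratorJob \
//     --input /data/lines --output /data/lines-with-ids --tempDir /tmp/seqIdGen \
//     --startIndex 0 --cleanUp true
//
// Each output line is "<sequential id>,<original line>" (TextOutputFormat drops the NullWritable key),
// and SequentialIdGeneratorJob.totalIdCount holds the number of ids assigned after run() returns.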