/**
 * Copyright 2011 Nube Technologies
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */
package co.nubetech.hiho.dedup;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import co.nubetech.hiho.common.HIHOException;

public class DedupJob extends Configured implements Tool {

    final static Logger logger = Logger
            .getLogger(co.nubetech.hiho.dedup.DedupJob.class);

    private String inputFormat = null;
    private String dedupBy = null; // "key" or "value"
    private String inputKeyClassName = null;
    private String inputValueClassName = null;
    private String inputPath = null;
    private String outputPath = null;
    private String outputFormat = null;
    private String delimiter = ",";
    private int column = 1;
    private long totalRecordsRead;
    private long badRecords;
    private long output;
    private long duplicateRecords;

    public long getTotalRecordsRead() {
        return totalRecordsRead;
    }

    public void setTotalRecordsRead(long totalRecordsRead) {
        this.totalRecordsRead = totalRecordsRead;
    }

    public long getBadRecords() {
        return badRecords;
    }

    public void setBadRecords(long badRecords) {
        this.badRecords = badRecords;
    }

    public long getOutput() {
        return output;
    }

    public void setOutput(long output) {
        this.output = output;
    }

    public long getDuplicateRecords() {
        return duplicateRecords;
    }

    public void setDuplicateRecords(long duplicateRecords) {
        this.duplicateRecords = duplicateRecords;
    }

    /**
     * Parses the command line arguments and, for known input formats, derives
     * default key/value class names and the output format.
     */
    public void populateConfiguration(String[] args) {
        for (int i = 0; i < args.length - 1; i++) {
            if ("-inputFormat".equals(args[i])) {
                inputFormat = args[++i];
                if (inputFormat
                        .equals("org.apache.hadoop.mapreduce.lib.input.TextInputFormat")) {
                    inputKeyClassName = "org.apache.hadoop.io.LongWritable";
                    inputValueClassName = "org.apache.hadoop.io.Text";
                } else if (inputFormat
                        .equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
                    inputKeyClassName = "org.apache.hadoop.io.Text";
                    inputValueClassName = "org.apache.hadoop.io.Text";
                    outputFormat = "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat";
                } else if (inputFormat
                        .equals("org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat")) {
                    outputFormat = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat";
                }
            } else if ("-dedupBy".equals(args[i])) {
                dedupBy = args[++i];
            } else if ("-inputKeyClassName".equals(args[i])) {
                inputKeyClassName = args[++i];
            } else if ("-inputValueClassName".equals(args[i])) {
                inputValueClassName = args[++i];
            } else if ("-inputPath".equals(args[i])) {
                inputPath = args[++i];
            } else if ("-outputPath".equals(args[i])) {
                outputPath = args[++i];
            } else if ("-delimiter".equals(args[i])) {
                delimiter = args[++i];
            } else if ("-column".equals(args[i])) {
                column = Integer.parseInt(args[++i]);
            } else if ("-outputFormat".equals(args[i])) {
                outputFormat = args[++i];
            }
        }
    }

    /**
     * Validates that all mandatory configuration values have been supplied.
     */
    public void checkMandatoryConfs() throws HIHOException {
        if (inputFormat == null) {
            throw new HIHOException(
                    "The provided input format is empty, please specify inputFormat");
        }
        if (dedupBy == null) {
            throw new HIHOException(
                    "The provided value of dedupBy is empty, please specify either key or value");
        }
        if ((!dedupBy.equals("key")) && (!dedupBy.equals("value"))) {
            throw new HIHOException(
                    "The provided value of dedupBy is incorrect, please specify either key or value");
        }
        if (inputKeyClassName == null) {
            throw new HIHOException(
                    "The provided input key class name is empty, please specify inputKeyClassName");
        }
        if (inputValueClassName == null) {
            throw new HIHOException(
                    "The provided input value class name is empty, please specify inputValueClassName");
        }
        if (inputPath == null) {
            throw new HIHOException(
                    "The provided input path is empty, please specify inputPath");
        }
        if (outputPath == null) {
            throw new HIHOException(
                    "The provided output path is empty, please specify outputPath");
        }
        if (outputFormat == null) {
            throw new HIHOException(
                    "The provided output format is empty, please specify outputFormat");
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        populateConfiguration(args);
        try {
            checkMandatoryConfs();
        } catch (HIHOException e1) {
            e1.printStackTrace();
            throw new Exception(e1);
        }

        Job job = new Job(conf);
        job.setJobName("Dedup job");
        job.setJarByClass(DedupJob.class);

        Class inputFormatClass = Class.forName(inputFormat);
        Class outputFormatClass = Class.forName(outputFormat);
        Class inputKeyClass = Class.forName(inputKeyClassName);
        Class inputValueClass = Class.forName(inputValueClassName);

        // Deduplication is driven either by the record key or by the value.
        if (dedupBy.equals("key")) {
            job.setMapperClass(DedupKeyMapper.class);
            job.setReducerClass(DedupKeyReducer.class);
            job.setMapOutputValueClass(inputValueClass);
        } else if (dedupBy.equals("value")) {
            job.setMapperClass(DedupValueMapper.class);
            job.setReducerClass(DedupValueReducer.class);
            job.setMapOutputValueClass(inputKeyClass);
        }

        job.setInputFormatClass(inputFormatClass);
        if (inputFormat
                .equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
            DelimitedTextInputFormat.setProperties(job, delimiter, column);
        }

        job.setMapOutputKeyClass(HihoTuple.class);
        job.setOutputKeyClass(inputKeyClass);
        job.setOutputValueClass(inputValueClass);
        job.setPartitionerClass(HihoHashPartitioner.class);

        FileInputFormat.setInputPaths(job, inputPath);
        job.setOutputFormatClass(outputFormatClass);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        try {
            logger.debug("Output format class is " + job.getOutputFormatClass());
            logger.debug("Class is "
                    + ReflectionUtils
                            .newInstance(job.getOutputFormatClass(),
                                    job.getConfiguration()).getClass()
                            .getName());
            job.waitForCompletion(false);
            if (job.isComplete()) {
                // Collect the job counters and report the dedup statistics.
                Counters counters = job.getCounters();
                totalRecordsRead = counters.findCounter(
                        DedupRecordCounter.TOTAL_RECORDS_READ).getValue();
                badRecords = counters.findCounter(
                        DedupRecordCounter.BAD_RECORD).getValue();
                output = counters.findCounter(DedupRecordCounter.OUTPUT)
                        .getValue();
                duplicateRecords = totalRecordsRead - output;
                logger.info("Total records read are: " + totalRecordsRead);
                logger.info("Bad Records are: " + badRecords);
                logger.info("Output records are: " + output);
                logger.info("Duplicate records are: " + duplicateRecords);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0;
    }
    public static void main(String[] args) throws Exception {
        DedupJob job = new DedupJob();
        int res = ToolRunner.run(new Configuration(), job, args);
        System.exit(res);
    }
}