package org.openflamingo.mapreduce.etl.statics; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.CounterGroup; import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.openflamingo.mapreduce.core.AbstractJob; import org.openflamingo.mapreduce.core.Delimiter; import org.openflamingo.mapreduce.etl.linecount.LineCountMapper; import org.openflamingo.mapreduce.type.DataType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Map; import static org.openflamingo.mapreduce.core.Constants.JOB_FAIL; import static org.openflamingo.mapreduce.core.Constants.JOB_SUCCESS; /** * 지정한 컬럼들을 기준으로 통계값을 계산하는 ETL. * * @author Edward KIM * @author Seo Ji Hye * @since 0.2 */ public class StaticsDriver extends AbstractJob { /** * SLF4J API */ private static final Logger logger = LoggerFactory.getLogger(StaticsDriver.class); public static void main(String[] args) throws Exception { int res = ToolRunner.run(new StaticsDriver(), args); System.exit(res); } @Override public int run(String[] strings) throws Exception { addInputOption(); addOutputOption(); addOption("inputDelimiter", "id", "입력 파일 컬럼 구분자", Delimiter.COMMA.getDelimiter()); addOption("outputDelimiter", "od", "출력 파일 컬럼 구분자", Delimiter.COMMA.getDelimiter()); addOption("columnsToStatics", "css", "통계값을 계산할 컬럼의 인덱스 입력", true); addOption("staticsModes", "sm", "통계 종류 입력(MAX,MIN,AVERAGE,SUM,DEVIATION,STANDARD_DEVIATION,VARIANCE)", true); addOption("dataTypes", "dt", "통계값을 계산할 컬럼값의 데이터 유형(기본값 long)", DataType.LONG.getDataType()); Map<String, String> parsedArgs = parseArguments(strings); if (parsedArgs == null) { return JOB_FAIL; } //////////////////////////////////////// // Line Count Hadoop Job /////////////////////////////////////// // 임시 디렉토리를 가져온다. flamingo-mapreduce-site.xml 파일에 기본값이 정의되어 있다. Path tempPathForLineCount = getTimestampTempPath(); logger.info("Temporary Path : {}", tempPathForLineCount.toString()); Job lineCountJob = prepareJob( getInputPath(), tempPathForLineCount, TextInputFormat.class, LineCountMapper.class, NullWritable.class, Text.class, TextOutputFormat.class); boolean step1 = lineCountJob.waitForCompletion(true); if (!step1) { return JOB_FAIL; } //////////////////////////////////////////////// // Calculating a start number per input split //////////////////////////////////////////////// // Counter에는 각 Mapper 별로 파일별 위치와 총 ROW의 개수를 포함하고 있다. Counters counters = lineCountJob.getCounters(); CounterGroup group = counters.getGroup(LineCountMapper.class.getName()); long lineCount = 0; for (Counter counter : group) { // 총 row 갯수 계산 lineCount += counter.getValue(); } //////////////////////////////////////// // Total Hadoop Job /////////////////////////////////////// Path tempPathForSum = getTimestampTempPath(); logger.info("Temporary Path : {}", tempPathForSum.toString()); Job totalJob = prepareJob(getInputPath(), tempPathForSum, TextInputFormat.class, StaticsMapper.class, NullWritable.class, Text.class, StaticsReducer.class, NullWritable.class, Text.class, TextOutputFormat.class); totalJob.getConfiguration().set("inputDelimiter", parsedArgs.get("--inputDelimiter")); totalJob.getConfiguration().set("outputDelimiter", parsedArgs.get("--outputDelimiter")); boolean step2 = totalJob.waitForCompletion(true); if (!step2) { return JOB_FAIL; } //////////////////////////////////////// // Statics Hadoop Job /////////////////////////////////////// Job staticsJob = prepareJob(tempPathForSum, getOutputPath(), TextInputFormat.class, StaticsMapper.class, LongWritable.class, Text.class, StaticsReducer.class, NullWritable.class, Text.class, TextOutputFormat.class); totalJob.getConfiguration().set("inputDelimiter", parsedArgs.get("--inputDelimiter")); totalJob.getConfiguration().set("outputDelimiter", parsedArgs.get("--outputDelimiter")); totalJob.getConfiguration().set("columnsToStatics", parsedArgs.get("--columnsToStatics")); totalJob.getConfiguration().set("staticsModes", parsedArgs.get("--staticsModes")); totalJob.getConfiguration().set("lineCount", String.valueOf(lineCount)); boolean step3 = staticsJob.waitForCompletion(true); if (!step3) { return JOB_FAIL; } try { // 임시 경로를 삭제한다. FileSystem.get(staticsJob.getConfiguration()).delete(tempPathForLineCount, true); logger.info("Now removed {}", tempPathForSum.toString()); FileSystem.get(staticsJob.getConfiguration()).delete(tempPathForSum, true); logger.info("Now removed {}", tempPathForSum.toString()); } catch (Exception ex) { // Exception handling is not need. } return JOB_SUCCESS; } }