package org.openflamingo.mapreduce.etl.statics;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.openflamingo.mapreduce.core.AbstractJob;
import org.openflamingo.mapreduce.core.Delimiter;
import org.openflamingo.mapreduce.etl.linecount.LineCountMapper;
import org.openflamingo.mapreduce.type.DataType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import static org.openflamingo.mapreduce.core.Constants.JOB_FAIL;
import static org.openflamingo.mapreduce.core.Constants.JOB_SUCCESS;
/**
 * ETL that computes statistics over the specified columns.
 *
 * @author Edward KIM
 * @author Seo Ji Hye
 * @since 0.2
 */
public class StaticsDriver extends AbstractJob {

    /**
     * SLF4J Logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(StaticsDriver.class);

    /**
     * Entry point. Delegates to Hadoop's {@link ToolRunner} so generic options
     * ({@code -D}, {@code -conf}, ...) are parsed before {@link #run(String[])} executes,
     * then exits with the job's status code.
     *
     * @param args command line arguments
     * @throws Exception if the driver cannot be launched
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new StaticsDriver(), args);
        System.exit(res);
    }

    /**
     * Runs a three-step MapReduce pipeline:
     * <ol>
     *   <li>Line-count job: counts total input rows into a temporary path.</li>
     *   <li>Total job: aggregates column values into a second temporary path.</li>
     *   <li>Statics job: computes the requested statistics into the output path.</li>
     * </ol>
     * Temporary paths are removed on a best-effort basis at the end.
     *
     * @param strings command line arguments
     * @return {@code JOB_SUCCESS}, or {@code JOB_FAIL} if argument parsing or any step fails
     * @throws Exception if a Hadoop job cannot be submitted or awaited
     */
    @Override
    public int run(String[] strings) throws Exception {
        addInputOption();
        addOutputOption();
        addOption("inputDelimiter", "id", "입력 파일 컬럼 구분자", Delimiter.COMMA.getDelimiter());
        addOption("outputDelimiter", "od", "출력 파일 컬럼 구분자", Delimiter.COMMA.getDelimiter());
        addOption("columnsToStatics", "css", "통계값을 계산할 컬럼의 인덱스 입력", true);
        addOption("staticsModes", "sm", "통계 종류 입력(MAX,MIN,AVERAGE,SUM,DEVIATION,STANDARD_DEVIATION,VARIANCE)", true);
        // NOTE(review): "dataTypes" is declared but never copied into any job
        // configuration below — confirm whether StaticsMapper/StaticsReducer expect it.
        addOption("dataTypes", "dt", "통계값을 계산할 컬럼값의 데이터 유형(기본값 long)", DataType.LONG.getDataType());

        Map<String, String> parsedArgs = parseArguments(strings);
        if (parsedArgs == null) {
            return JOB_FAIL;
        }

        ////////////////////////////////////////
        // Step 1. Line Count Hadoop Job
        ////////////////////////////////////////

        // Temporary directory; the default is defined in flamingo-mapreduce-site.xml.
        Path tempPathForLineCount = getTimestampTempPath();
        logger.info("Temporary Path : {}", tempPathForLineCount.toString());
        Job lineCountJob = prepareJob(
            getInputPath(), tempPathForLineCount,
            TextInputFormat.class, LineCountMapper.class,
            NullWritable.class, Text.class,
            TextOutputFormat.class);
        boolean step1 = lineCountJob.waitForCompletion(true);
        if (!step1) {
            return JOB_FAIL;
        }

        ////////////////////////////////////////////////
        // Calculating a start number per input split
        ////////////////////////////////////////////////

        // The counters hold, per mapper, the per-file position and the total row count;
        // sum them to obtain the total number of input rows.
        Counters counters = lineCountJob.getCounters();
        CounterGroup group = counters.getGroup(LineCountMapper.class.getName());
        long lineCount = 0;
        for (Counter counter : group) {
            lineCount += counter.getValue();
        }

        ////////////////////////////////////////
        // Step 2. Total Hadoop Job
        ////////////////////////////////////////

        Path tempPathForSum = getTimestampTempPath();
        logger.info("Temporary Path : {}", tempPathForSum.toString());
        Job totalJob = prepareJob(getInputPath(), tempPathForSum,
            TextInputFormat.class,
            StaticsMapper.class,
            NullWritable.class,
            Text.class,
            StaticsReducer.class,
            NullWritable.class,
            Text.class,
            TextOutputFormat.class);
        totalJob.getConfiguration().set("inputDelimiter", parsedArgs.get("--inputDelimiter"));
        totalJob.getConfiguration().set("outputDelimiter", parsedArgs.get("--outputDelimiter"));
        boolean step2 = totalJob.waitForCompletion(true);
        if (!step2) {
            return JOB_FAIL;
        }

        ////////////////////////////////////////
        // Step 3. Statics Hadoop Job
        ////////////////////////////////////////

        Job staticsJob = prepareJob(tempPathForSum, getOutputPath(),
            TextInputFormat.class,
            StaticsMapper.class,
            LongWritable.class,
            Text.class,
            StaticsReducer.class,
            NullWritable.class,
            Text.class,
            TextOutputFormat.class);
        // FIX: these settings previously targeted totalJob (already finished at this
        // point), so the statics job never received any of its parameters.
        staticsJob.getConfiguration().set("inputDelimiter", parsedArgs.get("--inputDelimiter"));
        staticsJob.getConfiguration().set("outputDelimiter", parsedArgs.get("--outputDelimiter"));
        staticsJob.getConfiguration().set("columnsToStatics", parsedArgs.get("--columnsToStatics"));
        staticsJob.getConfiguration().set("staticsModes", parsedArgs.get("--staticsModes"));
        staticsJob.getConfiguration().set("lineCount", String.valueOf(lineCount));
        boolean step3 = staticsJob.waitForCompletion(true);
        if (!step3) {
            return JOB_FAIL;
        }

        // Best-effort removal of the temporary paths; failure here must not fail the job.
        try {
            FileSystem fs = FileSystem.get(staticsJob.getConfiguration());
            fs.delete(tempPathForLineCount, true);
            // FIX: previously logged tempPathForSum while deleting tempPathForLineCount.
            logger.info("Now removed {}", tempPathForLineCount.toString());
            fs.delete(tempPathForSum, true);
            logger.info("Now removed {}", tempPathForSum.toString());
        } catch (Exception ex) {
            // Cleanup is non-fatal, but do not swallow the failure silently.
            logger.warn("Failed to remove temporary paths", ex);
        }
        return JOB_SUCCESS;
    }
}