package org.openflamingo.mapreduce.etl.grep;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.openflamingo.mapreduce.core.AbstractJob;
import org.openflamingo.mapreduce.core.Delimiter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import static org.openflamingo.mapreduce.core.Constants.JOB_FAIL;
import static org.openflamingo.mapreduce.core.Constants.JOB_SUCCESS;
/**
* 정규 표현식을 이용하여 로우를 Grep하는 Grep ETL Driver.
* 이 MapReduce ETL은 다음의 파라미터를 가진다.
* <ul>
* <li><tt>regEx (r)</tt> - 정규 표현식 (필수)</li>
* </ul>
*
* @author Edward KIM
* @author Seo Ji Hye
* @since 0.1
*/
public class GrepDriver extends AbstractJob {
/**
* SLF4J API
*/
private static final Logger logger = LoggerFactory.getLogger(GrepDriver.class);
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new GrepDriver(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption("inputDelimiter", "id", "입력 컬럼 구분자", Delimiter.COMMA.getDelimiter());
addOption("outputDelimiter", "od", "출력 컬럼 구분자", Delimiter.COMMA.getDelimiter());
addOption("columnSize", "cs", "컬럼의 개수", true);
addOption("grepMode", "gm", "Grep Mode 선택(ROW, COLUMN)", true);
addOption("columnsToGrep", "cg", "COLUMN모드시 grep할 컬럼 입력(0부터 시작)", false);
addOption("regEx", "re", "Grep할 정규 표현식", true);
Map<String, String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return JOB_FAIL;
}
Job job = null;
String grepMode = parsedArgs.get("--grepMode");
if ("ROW".equalsIgnoreCase(grepMode)) {
// make row mapper job
job = prepareJob(getInputPath(), getOutputPath(),
TextInputFormat.class,
GrepRowMapper.class,
NullWritable.class,
Text.class,
TextOutputFormat.class);
} else if ("COLUMN".equalsIgnoreCase(grepMode)) {
// make column mapper job
job = prepareJob(getInputPath(), getOutputPath(),
TextInputFormat.class,
GrepColumnMapper.class,
NullWritable.class,
Text.class,
TextOutputFormat.class);
job.getConfiguration().set("columnsToGrep", parsedArgs.get("--columnsToGrep"));
} else {
throw new IllegalArgumentException("Grep Mode가 올바르지 않습니다. {}");
}
job.getConfiguration().set("inputDelimiter", parsedArgs.get("--inputDelimiter"));
job.getConfiguration().set("outputDelimiter", parsedArgs.get("--outputDelimiter"));
job.getConfiguration().set("columnSize", parsedArgs.get("--columnSize"));
job.getConfiguration().set("regEx", parsedArgs.get("--regEx"));
return job.waitForCompletion(true) ? JOB_SUCCESS : JOB_FAIL;
}
}