package org.openflamingo.mapreduce.etl.clean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.openflamingo.mapreduce.core.Delimiter;
import org.openflamingo.mapreduce.parser.CsvRowParser;
import org.openflamingo.mapreduce.util.ArrayUtils;
import org.openflamingo.mapreduce.util.CounterUtils;
import org.openflamingo.mapreduce.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/**
* Clean ETL Mapper that removes the specified columns.
* This Mapper deletes the specified columns from each input row, reassembles the
* remaining columns using the output delimiter, and writes the result to the Context.
*
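* <p>A minimal configuration sketch (hypothetical values; the keys match those read
* in {@code setup}):</p>
* <pre>
* Configuration conf = new Configuration();
* conf.set("inputDelimiter", ",");       // delimiter of the incoming rows
* conf.set("outputDelimiter", "|");      // delimiter of the emitted rows
* conf.setInt("columnSize", 5);          // expected number of columns per row
* conf.set("columnsToClean", "1,3");     // comma-separated indexes of columns to drop
* </pre>
*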
* @author Edward KIM
* @author Seo Ji Hye
* @since 0.1
*/
public class CleanMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
/**
* SLF4J Logging
*/
private static final Logger logger = LoggerFactory.getLogger(CleanMapper.class);
/**
* Delimiter between columns, used to split an input row into columns
*/
private String inputDelimiter;
/**
* Delimiter between columns, used to assemble an output row
*/
private String outputDelimiter;
/**
* Number of columns in a row
*/
private int columnSize;
/**
* Indexes of the columns to delete
*/
private Integer[] columnsToClean;
/**
* CSV Row Parser
*/
private CsvRowParser parser;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration configuration = context.getConfiguration();
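// Column delimiters for splitting the input row and assembling the output row; both default to a comma.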
inputDelimiter = configuration.get("inputDelimiter", Delimiter.COMMA.getDelimiter());
outputDelimiter = configuration.get("outputDelimiter", Delimiter.COMMA.getDelimiter());
columnSize = configuration.getInt("columnSize", -1);
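// The indexes of the columns to remove arrive as a single comma-separated string.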
String[] stringArrayColumns = StringUtils.commaDelimitedListToStringArray(configuration.get("columnsToClean"));
if (columnSize == -1) {
throw new IllegalArgumentException("You must specify 'columnSize' for validating the column size.");
}
if (stringArrayColumns.length == 0) {
throw new IllegalArgumentException("You must specify 'columnsToClean' for cleaning some columns.");
}
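// Convert the index strings to Integers and build a parser bound to the expected column count.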
columnsToClean = ArrayUtils.toIntegerArray(stringArrayColumns);
parser = new CsvRowParser(columnSize, inputDelimiter, outputDelimiter);
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
try {
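// The parser throws IllegalArgumentException when the row's column count does not match columnSize.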
parser.parse(value.toString());
CounterUtils.writerMapperCounter(this, "YES", context);
} catch (IllegalArgumentException ex) {
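// Count the malformed row and skip it without emitting anything.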
CounterUtils.writerMapperCounter(this, "NO", context);
CounterUtils.writerMapperCounter(this, "Wrong Column Size", context);
logger.warn("Wrong Column Size!! [Size: {}] [Value: {}]", columnSize, value.toString());
return;
}
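// Remove the target columns and emit the reassembled row with a null key.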
parser.remove(columnsToClean);
context.write(NullWritable.get(), parser.toRowText());
}
}