package org.openflamingo.mapreduce.etl.grep; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.openflamingo.mapreduce.core.Delimiter; import org.openflamingo.mapreduce.parser.CsvRowParser; import org.openflamingo.mapreduce.util.ArrayUtils; import org.openflamingo.mapreduce.util.CounterUtils; import org.openflamingo.mapreduce.util.StringUtils; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 정규 표현식을 이용하여 로우를 Grep하는 Grep ETL 매퍼 * * @author Edward KIM * @author Seo Ji Hye * @since 0.1 */ public class GrepColumnMapper extends Mapper<LongWritable, Text, NullWritable, Text> { /** * 입력 Row를 컬럼으로 구분하기 위해서 사용하는 컬럼간 구분자 */ private String inputDelimiter; /** * 출력 Row를 구성하기 위해서 사용하는 컬럼간 구분자 */ private String outputDelimiter; /** * Grep할 컬럼들의 인덱스 배열 */ private Integer[] columnsToGrep; /** * Row의 컬럼 개수 */ private int columnSize; /** * CSV Row Parser */ private CsvRowParser parser; /** * 정규 표현식 */ private String[] regEx; /** * 정규 표현식 패턴 */ private Pattern pattern; @Override protected void setup(Context context) throws IOException, InterruptedException { Configuration configuration = context.getConfiguration(); inputDelimiter = configuration.get("inputDelimiter", Delimiter.COMMA.getDelimiter()); outputDelimiter = configuration.get("outputDelimiter", Delimiter.COMMA.getDelimiter()); columnSize = configuration.getInt("columnSize", -1); if (columnSize == -1) { throw new IllegalArgumentException("You must specify 'columnSize' for validating the column size."); } regEx = StringUtils.commaDelimitedListToStringArray(configuration.get("regEx")); String[] stringArrayColumns = StringUtils.commaDelimitedListToStringArray(configuration.get("columnsToGrep")); if (stringArrayColumns.length == 0) { throw new IllegalArgumentException("You must specify 'columnsToFilter' for cleaning some columns."); } columnsToGrep = ArrayUtils.toIntegerArray(stringArrayColumns); if (regEx.length != columnsToGrep.length) { throw new IllegalArgumentException("Invalid Parameter Length"); } parser = new CsvRowParser(columnSize, inputDelimiter, outputDelimiter); } @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { parser.parse(value.toString()); int i = 0; int count = 0; while (i < columnsToGrep.length) { pattern = Pattern.compile(regEx[i]); Matcher matcher = pattern.matcher(parser.get(columnsToGrep[i])); // columns들이 모두 match 되었는지 검사 if (matcher.find()) { count++; } i++; } if (count == columnsToGrep.length) { CounterUtils.writerMapperCounter(this, "YES", context); context.write(NullWritable.get(), new Text(value)); } else { CounterUtils.writerMapperCounter(this, "NO", context); } } }