package org.openflamingo.mapreduce.etl.rank; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import org.openflamingo.mapreduce.core.Delimiter; import org.openflamingo.mapreduce.parser.CsvRowParser; import java.io.IOException; import java.util.Iterator; import java.util.Set; import java.util.TreeMap; /** * * 특정 컬럼을 기준으로 Sort된 파일에 순위를 기록하는 Rank ETL. * 최하 순위를 지정하여 이하의 순위들은 기록하지 않을 수 있다. * * @author Edward KIM * @author Seo Ji Hye * @since 0.1 */ public class RankReducer extends Reducer<NullWritable, Text, NullWritable, Text> { /** * */ private String inputDelimiter; /** * */ private String outputDelimiter; /** * 랭킹 시작 번호 */ private int startNumber; /** * */ private int generatedSequenceIndex; /** * */ private int keyColumn; /** * */ private int topK; protected void setup(Context context) throws IOException, InterruptedException { Configuration configuration = context.getConfiguration(); outputDelimiter = configuration.get("outputDelimiter", Delimiter.COMMA.getDelimiter()); inputDelimiter = configuration.get("inputDelimiter", Delimiter.COMMA.getDelimiter()); generatedSequenceIndex = configuration.getInt("generatedSequenceIndex", -1); startNumber = configuration.getInt("startNumber", 1); keyColumn = configuration.getInt("columnToRank", -1); topK = configuration.getInt("topK", -1); if (generatedSequenceIndex == -1) { throw new IllegalArgumentException("sequence index 오류. sequence의 index를 확인해 주세요."); } if (keyColumn == -1) { throw new IllegalArgumentException("-columnToRank의 index값을 다시 입력해 주세요. index는 0부터 시작 됩니다."); } } public void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { Iterator<Text> iterator = values.iterator(); TreeMap map = new TreeMap(); CsvRowParser parser = new CsvRowParser(); parser.setInputDelimiter(inputDelimiter); while (iterator.hasNext()) { Text value = iterator.next(); parser.parse(value.toString()); String sequenceNumber = parser.get(generatedSequenceIndex); parser.remove(generatedSequenceIndex); // remove Generate Sequence map.put(sequenceNumber, parser.toRow()); } Set keySet = map.keySet(); int rank = startNumber; String beforeKeyColumn = ""; for (Iterator keySetIterator = keySet.iterator(); keySetIterator.hasNext(); ) { String line = (String) map.get(keySetIterator.next()); parser.parse(line); if (!beforeKeyColumn.equals(parser.get(keyColumn))) { // columnToRank들이 다른지 비교 rank = startNumber; } if (topK == -1 || rank <= topK) { context.write(NullWritable.get(), new Text(line + outputDelimiter + rank)); } beforeKeyColumn = parser.get(keyColumn); rank++; } } }