package com.github.minyk.morphlinesmr.reducer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import com.github.minyk.morphlinesmr.MorphlinesMRConfig;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Fields;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Reducer that hands every key group to a compiled Kite morphline.
 *
 * <p>For each key, the key text and the concatenation of all of its values (each value followed
 * by {@link #SEPARATOR}) are attached to a record as byte streams under the fields
 * {@code "key"} and {@code "values"}, and that record is passed to the morphline. The morphline
 * is compiled with a {@code ReducerRecordEmitter} built from the reducer context as its final
 * command; presumably that emitter writes the processed records to the job output (its
 * implementation is not part of this file).
 *
 * <p>Created by drake on 9/17/14.
 */
public class MorphlinesReducer extends Reducer<Text, Text, Text, Text> {

    private static final Logger LOGGER = LoggerFactory.getLogger(MorphlinesReducer.class);

    /**
     * Terminator appended after every value in the {@code "values"} stream.
     *
     * <p>NOTE(review): Java parses this literal as the octal escape for NUL (U+0000) followed by
     * the character '1', i.e. two characters. If a single Ctrl-A (U+0001) delimiter was intended
     * the literal is wrong, but morphline configurations may already split on the current
     * value, so it is intentionally left unchanged here - confirm before altering.
     */
    private static final String SEPARATOR = "\0001";

    /** Morphline compiled once per task in {@link #setup}. */
    private Command morphline;

    /**
     * Compiles the morphline named by the job configuration.
     *
     * @param context reducer context supplying the configuration and receiving emitted records
     * @throws IllegalStateException if the morphline file property is not set
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String morphlinePath = context.getConfiguration().get(MorphlinesMRConfig.MORPHLINE_FILE);
        if (morphlinePath == null) {
            // Fail with a message that names the property instead of a bare NPE from new File(null).
            throw new IllegalStateException(
                    "Missing required configuration property: " + MorphlinesMRConfig.MORPHLINE_FILE);
        }
        File morphLineFile = new File(morphlinePath);
        String morphLineId = context.getConfiguration().get(MorphlinesMRConfig.MORPHLINE_ID);

        ReducerRecordEmitter recordEmitter = new ReducerRecordEmitter(context);
        MorphlineContext morphlineContext = new MorphlineContext.Builder().build();
        morphline = new org.kitesdk.morphline.base.Compiler()
                .compile(morphLineFile, morphLineId, morphlineContext, recordEmitter);
    }

    /**
     * Feeds one key group to the morphline.
     *
     * @param key     group key; exposed to the morphline as a UTF-8 byte stream in field "key"
     * @param values  values of the group; concatenated, each followed by {@link #SEPARATOR}, and
     *                exposed as a UTF-8 byte stream in field "values"
     * @param context reducer context (not used directly here)
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Build a fresh record for every group. Record.put appends to a multimap, so reusing a
        // single instance would keep accumulating "key"/"values" entries (with already-consumed
        // streams) from earlier groups.
        Record record = new Record();

        // Text holds UTF-8; encode explicitly so the bytes do not depend on the JVM default charset.
        record.put("key", new ByteArrayInputStream(key.toString().getBytes(StandardCharsets.UTF_8)));

        StringBuilder joined = new StringBuilder();
        for (Text value : values) {
            joined.append(value.toString()).append(SEPARATOR);
        }
        record.put("values",
                new ByteArrayInputStream(joined.toString().getBytes(StandardCharsets.UTF_8)));

        if (!morphline.process(record)) {
            LOGGER.info("Morphline failed to process record: {}", record);
        }
    }
}