package eu.dnetlib.iis.common.javamapreduce;

import java.io.Closeable;
import java.io.IOException;

import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;

import eu.dnetlib.iis.common.javamapreduce.hack.AvroMultipleOutputs;

/**
 * Helper class for writing data to multiple outputs in accordance with our
 * conventions.
 *
 * @author Mateusz Kobos
 */
public class MultipleOutputs implements Closeable {
    private final AvroMultipleOutputs mos;

    public MultipleOutputs(TaskInputOutputContext<?, ?, ?, ?> context) {
        this.mos = new AvroMultipleOutputs(context);
        createOutputFiles(this.mos);
    }

    /**
     * Creates output files for each of the named outputs by initializing the
     * writers assigned to them. This is a HACK.
     *
     * Originally, when the input data contained no records, the
     * {@link AvroMultipleOutputs} class generated no output files at all.
     * This is bad because, according to our conventions, a workflow node
     * should always produce the data stores it defines as its outputs,
     * whether they contain records or are empty. Producing nothing at all
     * complicates things: the missing data stores would have to be handled
     * separately by every downstream workflow node, which would introduce
     * unneeded complexity.
     *
     * The output files are created by the writers associated with each named
     * output ("port" in the parlance of our conventions). A given output file
     * is created during writer initialization. However, the initialization is
     * done lazily in the {@code AvroMultipleOutputs.write} method, i.e. a
     * writer is initialized on the first attempt to write something. If there
     * are no input records, the map function of a mapper that writes
     * something is never called, so the initialization never happens.
     *
     * Here we essentially undo this lazy initialization by explicitly
     * initializing the writers in the constructor.
     */
    private static void createOutputFiles(AvroMultipleOutputs mos) {
        for (String namedOutput : mos.getNamedOutputs()) {
            try {
                TaskAttemptContext taskContext = mos.getContext(namedOutput);
                mos.getRecordWriter(taskContext, getPortOutputPath(namedOutput));
            } catch (IOException | InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Writes a record embedded in an {@code AvroKey} object to the data store
     * associated with the given port.
     */
    public <T> void write(String portName, AvroKey<T> record)
            throws IOException, InterruptedException {
        mos.write(portName, record, NullWritable.get(), getPortOutputPath(portName));
    }

    private static String getPortOutputPath(String portName) {
        return portName + "/part";
    }

    @Override
    public void close() throws IOException {
        try {
            this.mos.close();
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    public TaskAttemptContext getContext(String namedOutput) throws IOException {
        return this.mos.getContext(namedOutput);
    }

    @SuppressWarnings("rawtypes")
    public RecordWriter getRecordWriter(TaskAttemptContext taskContext,
            String baseFileName) throws IOException, InterruptedException {
        return this.mos.getRecordWriter(taskContext, baseFileName);
    }
}
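
// A minimal usage sketch (an illustration, not part of the original class),
// showing the lifecycle the constructor hack above supports: writers are
// initialized eagerly in setup(), so the output data store is created even
// when map() is never invoked. It assumes the job setup registered a named
// output ("port") called "person" for an Avro Person schema; Person,
// ExampleMapper, and the "person" port name are hypothetical:
//
//     public class ExampleMapper
//             extends Mapper<AvroKey<Person>, NullWritable, NullWritable, NullWritable> {
//
//         private MultipleOutputs outputs;
//
//         @Override
//         protected void setup(Context context) {
//             // Eagerly initializes the writers for all named outputs, so
//             // the "person" data store exists even for empty input.
//             outputs = new MultipleOutputs(context);
//         }
//
//         @Override
//         protected void map(AvroKey<Person> key, NullWritable ignore, Context context)
//                 throws IOException, InterruptedException {
//             // Writes the record to the data store of the "person" port.
//             outputs.write("person", key);
//         }
//
//         @Override
//         protected void cleanup(Context context) throws IOException {
//             // Flushes and closes all per-port writers.
//             outputs.close();
//         }
//     }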