package eu.dnetlib.iis.wf.importer.content;

import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.avro.mapred.AvroKey;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

import com.google.common.base.Preconditions;

import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.common.javamapreduce.MultipleOutputs;
import eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl;

/**
 * {@link DocumentContentUrl} mime type based dispatcher writing {@link DocumentContentUrl} objects to dedicated output port.
 * 
 * @author mhorst
 *
 */
public class DocumentContentUrlDispatcher extends Mapper<AvroKey<DocumentContentUrl>, NullWritable, NullWritable, NullWritable> {

    private static final Logger log = Logger.getLogger(DocumentContentUrlDispatcher.class);

    /** Prefix of the per-port property holding the CSV list of mime types dispatched to that port. */
    protected static final String PROPERTY_PREFIX_MIMETYPES_CSV = "mimetypes.csv.";

    /** Property holding the whitespace-separated list of named output ports. */
    protected static final String PROPERTY_MULTIPLEOUTPUTS = "avro.mapreduce.multipleoutputs";

    /**
     * Mime type to port name mappings.
     */
    private Map<CharSequence, String> mimeTypeToPortNameMap;

    /** Multiple outputs writer, initialized in {@link #setup(Context)} and closed in {@link #cleanup(Context)}. */
    private MultipleOutputs mos;

    /**
     * Initializes {@link #mos} and builds the mime type to port name mapping by iterating through
     * the declared output port names and reading the mime types CSV property defined for each port.
     * 
     * @throws IllegalArgumentException when the required multiple outputs property is blank or missing
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String multipleOutputStr = context.getConfiguration().get(PROPERTY_MULTIPLEOUTPUTS);
        Preconditions.checkArgument(StringUtils.isNotBlank(multipleOutputStr),
                "required parameter '%s' is missing!", PROPERTY_MULTIPLEOUTPUTS);
        this.mos = instantiateMultipleOutputs(context);
        this.mimeTypeToPortNameMap = new HashMap<CharSequence, String>();
        // iterating through output port names and looking for mimetypes properties defined for each output port;
        // reuse the already validated value instead of re-reading the configuration
        String[] portNames = StringUtils.split(multipleOutputStr);
        for (String portName : portNames) {
            String currentMimeTypePropName = PROPERTY_PREFIX_MIMETYPES_CSV + portName;
            String currentMimeTypesCsv = context.getConfiguration().get(currentMimeTypePropName);
            if (currentMimeTypesCsv != null) {
                String[] currentPortMimeTypes = StringUtils.split(currentMimeTypesCsv,
                        WorkflowRuntimeParameters.DEFAULT_CSV_DELIMITER);
                for (String currentPortMimeType : currentPortMimeTypes) {
                    if (!currentPortMimeType.isEmpty()
                            && !WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE.equals(currentPortMimeType)) {
                        // Locale.ROOT keeps mime type matching independent from the JVM default locale
                        this.mimeTypeToPortNameMap.put(currentPortMimeType.toLowerCase(Locale.ROOT), portName);
                    }
                }
            } else {
                log.warn("undefined property '" + currentMimeTypePropName
                        + "', no data will be dispatched to port '" + portName + "'");
            }
        }
    }

    /**
     * Instantiates multiple outputs. Protected factory method, overridable e.g. for testing purposes.
     */
    protected MultipleOutputs instantiateMultipleOutputs(Context context) {
        return new MultipleOutputs(context);
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }

    /**
     * Dispatches the {@link DocumentContentUrl} record to the output port registered for its
     * (lowercased) mime type. Records with an unhandled or null mime type are skipped with a warning.
     */
    @Override
    public void map(AvroKey<DocumentContentUrl> key, NullWritable ignore, Context context)
            throws IOException, InterruptedException {
        DocumentContentUrl currentRecord = key.datum();
        if (currentRecord.getMimeType() != null) {
            String lowercasedMimeType = currentRecord.getMimeType().toString().toLowerCase(Locale.ROOT);
            // single lookup instead of containsKey() + get(); mapped port names are never null
            String portName = this.mimeTypeToPortNameMap.get(lowercasedMimeType);
            if (portName != null) {
                mos.write(portName, new AvroKey<DocumentContentUrl>(currentRecord));
            } else {
                log.warn("skipping, got unhandled mime type: " + lowercasedMimeType
                        + " for object: " + currentRecord.getId());
            }
        } else {
            log.warn("got null mime type for object: " + currentRecord.getId());
        }
    }
}