package eu.dnetlib.iis.wf.importer.content;

import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_CONTENT_CONNECTION_TIMEOUT;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_CONTENT_MAX_FILE_SIZE_MB;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_CONTENT_READ_TIMEOUT;

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl;
import eu.dnetlib.iis.importer.schemas.DocumentContent;

/**
 * {@link DocumentContentUrl} based importer producing {@link DocumentContent} output.
 *
 * @author mhorst
 *
 */
public class DocumentContentUrlBasedImporterMapper
        extends Mapper<AvroKey<DocumentContentUrl>, NullWritable, AvroKey<DocumentContent>, NullWritable> {

    private static final Logger log = Logger.getLogger(DocumentContentUrlBasedImporterMapper.class);

    /**
     * Connection timeout.
     */
    private int connectionTimeout;

    /**
     * Read timeout.
     */
    private int readTimeout;

    /**
     * Maximum allowed file size expressed in KB.
     */
    private long maxFileSizeKB = Long.MAX_VALUE;

    /**
     * Counter for records whose content size exceeds the limit.
     */
    private Counter sizeExceededCounter;

    /**
     * Counter for records with an invalid size: less than or equal to 0.
     */
    private Counter sizeInvalidCounter;

    /**
     * Hadoop counters enum for invalid records.
     */
    public enum InvalidRecordCounters {
        SIZE_EXCEEDED,
        SIZE_INVALID
    }

    //------------------------ LOGIC --------------------------

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // connection and approver related parameters
        this.connectionTimeout = context.getConfiguration().getInt(IMPORT_CONTENT_CONNECTION_TIMEOUT, 60000);
        this.readTimeout = context.getConfiguration().getInt(IMPORT_CONTENT_READ_TIMEOUT, 60000);
        this.sizeInvalidCounter = context.getCounter(InvalidRecordCounters.SIZE_INVALID);
        this.sizeInvalidCounter.setValue(0);
        this.sizeExceededCounter = context.getCounter(InvalidRecordCounters.SIZE_EXCEEDED);
        this.sizeExceededCounter.setValue(0);
        Integer maxFileSizeMB = WorkflowRuntimeParameters.getIntegerParamValue(
                IMPORT_CONTENT_MAX_FILE_SIZE_MB, context.getConfiguration());
        if (maxFileSizeMB != null) {
            this.maxFileSizeKB = 1024L * maxFileSizeMB;
        }
    }

    /**
     * Provides content for the given URL.
     */
    protected byte[] getContent(String url) throws IOException, InvalidSizeException {
        return ObjectStoreContentProviderUtils.getContentFromURL(
                url, this.connectionTimeout, this.readTimeout);
    }

    @Override
    protected void map(AvroKey<DocumentContentUrl> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        DocumentContentUrl docUrl = key.datum();
        if (docUrl.getContentSizeKB() <= 0) {
            // declared size is invalid, discard the record
            log.warn("content " + docUrl.getId() + " discarded for location: " + docUrl.getUrl()
                    + " and size [kB]: " + docUrl.getContentSizeKB()
                    + ", size is expected to be greater than 0!");
            this.sizeInvalidCounter.increment(1);
        } else if (docUrl.getContentSizeKB() <= maxFileSizeKB) {
            // declared size is within the limit, retrieve the content
            long startTimeContent = System.currentTimeMillis();
            log.info("starting content retrieval for id: " + docUrl.getId()
                    + ", location: " + docUrl.getUrl()
                    + " and size [kB]: " + docUrl.getContentSizeKB());
            try {
                byte[] content = getContent(docUrl.getUrl().toString());
                DocumentContent.Builder documentContentBuilder = DocumentContent.newBuilder();
                documentContentBuilder.setId(docUrl.getId());
                documentContentBuilder.setPdf(ByteBuffer.wrap(content));
                context.write(new AvroKey<DocumentContent>(documentContentBuilder.build()), NullWritable.get());
                log.info("content retrieval for id: " + docUrl.getId() + " took: "
                        + (System.currentTimeMillis() - startTimeContent) + " ms");
            } catch (InvalidSizeException e) {
                log.warn("content " + docUrl.getId() + " discarded for location: " + docUrl.getUrl()
                        + ", real size is expected to be greater than 0!");
                this.sizeInvalidCounter.increment(1);
            }
        } else {
            // declared size exceeds the configured limit
            this.sizeExceededCounter.increment(1);
            log.info("content " + docUrl.getId() + " discarded for location: " + docUrl.getUrl()
                    + " and size [kB]: " + docUrl.getContentSizeKB()
                    + ", size limit: " + maxFileSizeKB + " exceeded!");
        }
    }
}