package eu.dnetlib.iis.wf.importer.content;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_CONTENT_CONNECTION_TIMEOUT;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_CONTENT_MAX_FILE_SIZE_MB;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_CONTENT_READ_TIMEOUT;
import java.io.IOException;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl;
import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;
/**
 * {@link DocumentContentUrl} based importer producing {@link DocumentText} output.
 *
 * <p>For every input record the declared content size is validated first. Records with
 * a non-positive size, or a size above the configured maximum, are skipped and counted
 * in {@link InvalidRecordCounters}. For the remaining records the content is retrieved
 * from the record's URL and written out as {@link DocumentText}.
 *
 * @author mhorst
 */
public class DocumentTextUrlBasedImporterMapper extends Mapper<AvroKey<DocumentContentUrl>, NullWritable, AvroKey<DocumentText>, NullWritable> {

    // Logger category must be this class so that log entries are attributed correctly.
    private static final Logger log = Logger.getLogger(DocumentTextUrlBasedImporterMapper.class);

    /**
     * Maximum content size in kilobytes. Stays at {@link Long#MAX_VALUE} when no
     * limit is configured.
     */
    private long maxFileSizeKB = Long.MAX_VALUE;

    /**
     * Connection timeout handed over to the content provider
     * (presumably in milliseconds - confirm against the provider).
     */
    private int connectionTimeout;

    /**
     * Read timeout handed over to the content provider
     * (presumably in milliseconds - confirm against the provider).
     */
    private int readTimeout;

    /**
     * Hadoop counters enum of invalid records.
     */
    public enum InvalidRecordCounters {
        /** Declared content size is above the configured maximum. */
        SIZE_EXCEEDED,
        /** Declared content size is not positive, or content retrieval reported an invalid size. */
        SIZE_INVALID
    }

    /**
     * Reads the connection and read timeouts (both defaulting to 60000) and the optional
     * maximum file size from the job configuration, then sets both counters to zero.
     *
     * @param context mapper context providing the job configuration and counters
     */
    @Override
    protected void setup(Context context) {
        // connection related parameters
        this.connectionTimeout = context.getConfiguration().getInt(
                IMPORT_CONTENT_CONNECTION_TIMEOUT, 60000);
        this.readTimeout = context.getConfiguration().getInt(
                IMPORT_CONTENT_READ_TIMEOUT, 60000);

        // handling maximum content size: the parameter is expressed in megabytes,
        // a null value leaves the default limit untouched
        Integer maxFileSizeMB = WorkflowRuntimeParameters.getIntegerParamValue(
                IMPORT_CONTENT_MAX_FILE_SIZE_MB, context.getConfiguration());
        if (maxFileSizeMB != null) {
            // long arithmetic, so that large megabyte values do not overflow int
            this.maxFileSizeKB = 1024L * maxFileSizeMB;
        }

        // both counters are set explicitly to zero up front
        context.getCounter(InvalidRecordCounters.SIZE_EXCEEDED).setValue(0);
        context.getCounter(InvalidRecordCounters.SIZE_INVALID).setValue(0);
    }

    /**
     * Provides contents for given url.
     *
     * @param url location of the content
     * @return content bytes as returned by the content provider
     * @throws IOException propagated from the content provider
     * @throws InvalidSizeException propagated from the content provider
     */
    protected byte[] getContent(String url) throws IOException, InvalidSizeException {
        return ObjectStoreContentProviderUtils.getContentFromURL(
                url, this.connectionTimeout, this.readTimeout);
    }

    /**
     * Converts a single {@link DocumentContentUrl} record into a {@link DocumentText} record.
     *
     * <p>Records with a non-positive declared size increment
     * {@link InvalidRecordCounters#SIZE_INVALID}; records exceeding the size limit increment
     * {@link InvalidRecordCounters#SIZE_EXCEEDED}. In both cases nothing is written.
     *
     * @param key wrapper holding the input record
     * @param value unused
     * @param context mapper context used for writing output and updating counters
     * @throws IOException when content retrieval or writing the output fails
     * @throws InterruptedException propagated from writing to the context
     */
    @Override
    protected void map(AvroKey<DocumentContentUrl> key, NullWritable value,
            Context context) throws IOException, InterruptedException {
        DocumentContentUrl docUrl = key.datum();
        long contentSizeKB = docUrl.getContentSizeKB();

        if (contentSizeKB <= 0) {
            log.warn("content " + docUrl.getId() + " discarded for location: " + docUrl.getUrl()
                    + " and size [kB]: " + contentSizeKB + ", size is expected to be greater than 0!");
            context.getCounter(InvalidRecordCounters.SIZE_INVALID).increment(1);
        } else if (contentSizeKB <= maxFileSizeKB) {
            try {
                long startTimeContent = System.currentTimeMillis();
                byte[] textContent = getContent(docUrl.getUrl().toString());
                log.info("text content retrieval for id: " + docUrl.getId()
                        + " and location: " + docUrl.getUrl() + " took: "
                        + (System.currentTimeMillis() - startTimeContent) + " ms");

                DocumentText.Builder documentTextBuilder = DocumentText.newBuilder();
                documentTextBuilder.setId(docUrl.getId());
                documentTextBuilder.setText(new String(textContent,
                        ObjectStoreContentProviderUtils.defaultEncoding));
                context.write(new AvroKey<DocumentText>(documentTextBuilder.build()), NullWritable.get());
            } catch (InvalidSizeException e) {
                // declared size looked valid but the retrieval reported an invalid size:
                // the record is skipped and counted rather than failing the task
                log.warn("content " + docUrl.getId() + " discarded for location: " + docUrl.getUrl()
                        + ", real size is expected to be greater than 0!");
                context.getCounter(InvalidRecordCounters.SIZE_INVALID).increment(1);
            }
        } else {
            context.getCounter(InvalidRecordCounters.SIZE_EXCEEDED).increment(1);
            log.warn("skipping processing for id " + docUrl.getId()
                    + " due to max file size limit=" + maxFileSizeKB
                    + " KB exceeded: " + contentSizeKB + " KB");
        }
    }
}