package eu.dnetlib.iis.wf.importer.content;
import java.io.IOException;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import eu.dnetlib.data.objectstore.rmi.ObjectStoreFile;
import eu.dnetlib.iis.common.schemas.Identifier;
import eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl;
import eu.dnetlib.iis.wf.importer.facade.ObjectStoreFacade;
import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeException;
import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeUtils;
/**
* {@link Identifier} based ObjectStore records importer producing {@link DocumentContentUrl} output.
*
* @author mhorst
*
*/
public class ObjectStoreDocumentContentUrlImporterMapper extends Mapper<AvroKey<Identifier>, NullWritable, AvroKey<DocumentContentUrl>, NullWritable> {
private static final Logger log = Logger.getLogger(ObjectStoreDocumentContentUrlImporterMapper.class);
/**
* Progress log interval.
*/
private static final int PROGRESS_LOG_INTERVAL = 100000;
/**
* MDStore service facade.
*/
private ObjectStoreFacade objectStoreFacade;
//------------------------ LOGIC --------------------------
@Override
protected void setup(Context context) throws IOException, InterruptedException {
try {
objectStoreFacade = ServiceFacadeUtils.instantiate(context.getConfiguration());
} catch (ServiceFacadeException e) {
throw new RuntimeException("unable to instantiate MDStore service facade", e);
}
}
@Override
protected void map(AvroKey<Identifier> key, NullWritable value, Context context) throws IOException, InterruptedException {
try {
String objectStoreId = key.datum().getId().toString();
long startTime = System.currentTimeMillis();
long intervalTime = startTime;
int recordIndex=0;
log.info("starting importing process from object store: " + objectStoreId);
for (String record : objectStoreFacade.deliverObjects(objectStoreId, 0l, System.currentTimeMillis())) {
context.write(new AvroKey<DocumentContentUrl>(buildRecord(record)), NullWritable.get());
if (recordIndex>0 && recordIndex%PROGRESS_LOG_INTERVAL==0) {
log.info("content retrieval progress: " + recordIndex + ", time taken to process " +
PROGRESS_LOG_INTERVAL + " elements: " + ((System.currentTimeMillis() - intervalTime)/1000) + " secs");
intervalTime = System.currentTimeMillis();
}
recordIndex++;
}
log.info("URL importing process from object store: " + objectStoreId + " has finished");
} catch (ServiceFacadeException e) {
throw new IOException("Unable to deliver objects using ObjectStore facade", e);
}
}
// ------------------------ PRIVATE --------------------------
/**
* Builds {@link DocumentContentUrl} record out of ObjectStore metadata record encoded as JSON.
*
* @param metaJsonRecord metadata record obtained from ObjectStore
*/
private DocumentContentUrl buildRecord(String metaJsonRecord) {
ObjectStoreFile objStoreFile = ObjectStoreFile.createObject(metaJsonRecord);
String resultId = ObjectStoreContentProviderUtils.extractResultIdFromObjectId(objStoreFile.getObjectID());
DocumentContentUrl.Builder documentContentUrlBuilder = DocumentContentUrl.newBuilder();
documentContentUrlBuilder.setId(resultId);
documentContentUrlBuilder.setUrl(objStoreFile.getURI());
documentContentUrlBuilder.setMimeType(objStoreFile.getMimeType());
documentContentUrlBuilder.setContentChecksum(objStoreFile.getMd5Sum());
documentContentUrlBuilder.setContentSizeKB(objStoreFile.getFileSizeKB());
return documentContentUrlBuilder.build();
}
}