package eu.dnetlib.iis.wf.importer.dataset;
import java.io.IOException;
import java.io.StringReader;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import eu.dnetlib.iis.common.javamapreduce.MultipleOutputs;
import eu.dnetlib.iis.common.schemas.Identifier;
import eu.dnetlib.iis.importer.schemas.DataSetReference;
import eu.dnetlib.iis.importer.schemas.DatasetToMDStore;
import eu.dnetlib.iis.wf.importer.RecordReceiver;
import eu.dnetlib.iis.wf.importer.facade.MDStoreFacade;
import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeException;
import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeUtils;
/**
 * Mapper importing dataset records from MDStore identified with {@link Identifier} provided at input.
 *
 * For every input identifier all records delivered by {@link MDStoreFacade} are parsed with a SAX parser
 * and the objects received from {@link DataciteDumpXmlHandler} are written to two named outputs:
 * one holding {@link DataSetReference} records and one holding {@link DatasetToMDStore} relations.
 * Nothing is written to the regular mapper output.
 *
 * @author mhorst
 *
 */
public class DatasetImporterMapper extends Mapper<AvroKey<Identifier>, NullWritable, NullWritable, NullWritable> {

    private static final Logger log = Logger.getLogger(DatasetImporterMapper.class);

    /**
     * Logging interval: progress is logged once per this many processed records.
     */
    private static final int PROGRESS_LOG_INTERVAL = 100000;

    /**
     * SAX feature controlling inclusion of external general entities.
     */
    private static final String FEATURE_EXTERNAL_GENERAL_ENTITIES =
            "http://xml.org/sax/features/external-general-entities";

    /**
     * SAX feature controlling inclusion of external parameter entities.
     */
    private static final String FEATURE_EXTERNAL_PARAMETER_ENTITIES =
            "http://xml.org/sax/features/external-parameter-entities";

    /**
     * Multiple outputs.
     */
    private MultipleOutputs mos;

    /**
     * Dataset output.
     */
    private String namedOutputDataset;

    /**
     * Dataset to MDStore relation output.
     */
    private String namedOutputDatasetToMDStore;

    /**
     * Sax parser to be used for dataset metadata extraction.
     */
    private SAXParser saxParser;

    /**
     * MDStore service facade.
     */
    private MDStoreFacade mdStoreFacade;

    //------------------------ LOGIC --------------------------

    /**
     * Reads named output names from the job configuration and instantiates multiple outputs,
     * the SAX parser and the MDStore service facade.
     *
     * @param context mapper context providing the job configuration
     * @throws RuntimeException when any of the named outputs is not configured
     * or when the SAX parser or the MDStore service facade cannot be created
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        namedOutputDataset = context.getConfiguration().get("output.dataset");
        if (namedOutputDataset == null || namedOutputDataset.isEmpty()) {
            throw new RuntimeException("no named output provided for dataset");
        }
        namedOutputDatasetToMDStore = context.getConfiguration().get("output.dataset_to_mdstore");
        if (namedOutputDatasetToMDStore == null || namedOutputDatasetToMDStore.isEmpty()) {
            throw new RuntimeException("no named output provided for dataset to mdstore relations");
        }
        mos = new MultipleOutputs(context);
        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        parserFactory.setNamespaceAware(true);
        try {
            // records are obtained from an external service: never resolve external entities (XXE)
            parserFactory.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
            parserFactory.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);
            saxParser = parserFactory.newSAXParser();
            mdStoreFacade = ServiceFacadeUtils.instantiate(context.getConfiguration());
        } catch (ParserConfigurationException | SAXException e) {
            throw new RuntimeException(e);
        } catch (ServiceFacadeException e) {
            throw new RuntimeException("unable to instantiate MDStore service facade", e);
        }
    }

    /**
     * Closes multiple outputs.
     *
     * @param context mapper context, not used
     */
    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }

    /**
     * Imports all records of a single MDStore.
     *
     * Each record delivered for the MDStore identifier is parsed with a fresh {@link DataciteDumpXmlHandler}
     * which is given receivers writing to the dataset and dataset to MDStore named outputs.
     * Progress is logged every {@link #PROGRESS_LOG_INTERVAL} records.
     *
     * @param key input record holding MDStore identifier
     * @param ignore not used
     * @param context mapper context, not used directly
     * @throws RuntimeException when records cannot be delivered by the MDStore facade
     * or when a record cannot be parsed
     */
    @Override
    public void map(AvroKey<Identifier> key, NullWritable ignore, Context context)
            throws IOException, InterruptedException {
        String mdStoreId = key.datum().getId().toString();
        // receivers rely on instance fields only, a single pair serves all the records
        RecordReceiver<DataSetReference> datasetReceiver = createDatasetReceiver();
        RecordReceiver<DatasetToMDStore> datasetToMDStoreReceiver = createDatasetToMDStoreReceiver();
        try {
            long startTime = System.currentTimeMillis();
            int currentCount = 0;
            for (String record : mdStoreFacade.deliverMDRecords(mdStoreId)) {
                // new handler for every record, as in the original flow
                DataciteDumpXmlHandler handler = new DataciteDumpXmlHandler(
                        datasetReceiver, datasetToMDStoreReceiver, mdStoreId);
                saxParser.parse(new InputSource(new StringReader(record)), handler);
                currentCount++;
                if (currentCount % PROGRESS_LOG_INTERVAL == 0) {
                    log.info("current progress: " + currentCount + ", last package of " + PROGRESS_LOG_INTERVAL
                            + " processed in " + ((System.currentTimeMillis() - startTime) / 1000) + " secs");
                    startTime = System.currentTimeMillis();
                }
            }
            log.info("total number of processed records for mdstore " + mdStoreId + ": " + currentCount);
        } catch (ServiceFacadeException e) {
            throw new RuntimeException("unable to deliver records from MDStore: " + mdStoreId, e);
        } catch (SAXException e) {
            throw new RuntimeException("unable to parse dataset record", e);
        }
    }

    //------------------------ PRIVATE --------------------------

    /**
     * Creates receiver writing {@link DataSetReference} objects to the dataset named output.
     */
    private RecordReceiver<DataSetReference> createDatasetReceiver() {
        return new RecordReceiver<DataSetReference>() {
            @Override
            public void receive(DataSetReference object) throws IOException {
                try {
                    mos.write(namedOutputDataset, new AvroKey<DataSetReference>(object));
                } catch (InterruptedException e) {
                    // receive() cannot throw InterruptedException: restore the flag before wrapping
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }
            }
        };
    }

    /**
     * Creates receiver writing {@link DatasetToMDStore} objects to the dataset to MDStore named output.
     */
    private RecordReceiver<DatasetToMDStore> createDatasetToMDStoreReceiver() {
        return new RecordReceiver<DatasetToMDStore>() {
            @Override
            public void receive(DatasetToMDStore object) throws IOException {
                try {
                    mos.write(namedOutputDatasetToMDStore, new AvroKey<DatasetToMDStore>(object));
                } catch (InterruptedException e) {
                    // receive() cannot throw InterruptedException: restore the flag before wrapping
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }
            }
        };
    }
}