package eu.dnetlib.iis.wf.ingest.pmc.metadata;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.avro.mapred.AvroKey;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import eu.dnetlib.iis.audit.schemas.Fault;
import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.common.fault.FaultUtils;
import eu.dnetlib.iis.common.javamapreduce.MultipleOutputs;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata;
import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;
import eu.dnetlib.iis.wf.ingest.pmc.plaintext.NlmToDocumentTextConverter;
/**
 * Hadoop mapper turning {@link DocumentText} records holding PMC XML into
 * {@link ExtractedDocumentMetadata} records.
 *
 * <p>For every input record whose id is not excluded and whose text is not blank, the plaintext is
 * obtained through {@link NlmToDocumentTextConverter} and the metadata through {@link JatsXmlHandler}.
 * Results are written to the metadata named output; extraction failures are additionally reported
 * to the fault named output.</p>
 *
 * @author Michal Oniszczuk (m.oniszczuk@icm.edu.pl)
 * @author mhorst
 */
public class MetadataImporter extends Mapper<AvroKey<DocumentText>, NullWritable, NullWritable, NullWritable> {

    protected static final Logger log = Logger.getLogger(MetadataImporter.class);

    /** Configuration key holding the name of the metadata named output. */
    public static final String NAMED_OUTPUT_META = "output.meta";

    /** Configuration key holding the name of the fault named output. */
    public static final String NAMED_OUTPUT_FAULT = "output.fault";

    /** Configuration key holding comma separated identifiers excluded from processing. */
    public static final String EXCLUDED_IDS = "excluded.ids";

    /** Key under which the source text is stored in fault supplementary data. */
    public static final String FAULT_TEXT = "text";

    /** Configuration key holding the OAI element namespace URI. */
    public static final String PARAM_INGEST_METADATA_OAI_NAMESPACE = "ingest.metadata.oai.element.namespace";

    // Parser feature URIs shared by both parsers so their configuration stays in sync.
    private static final String FEATURE_VALIDATION = "http://xml.org/sax/features/validation";

    private static final String FEATURE_LOAD_DTD_GRAMMAR =
            "http://apache.org/xml/features/nonvalidating/load-dtd-grammar";

    private static final String FEATURE_LOAD_EXTERNAL_DTD =
            "http://apache.org/xml/features/nonvalidating/load-external-dtd";

    private static final String FEATURE_EXTERNAL_GENERAL_ENTITIES =
            "http://xml.org/sax/features/external-general-entities";

    private static final String FEATURE_EXTERNAL_PARAMETER_ENTITIES =
            "http://xml.org/sax/features/external-parameter-entities";

    /**
     * Namespace handed over to the plaintext converter, read from configuration in
     * {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}.
     */
    private Namespace oaiNamespace;

    /**
     * Multiple outputs.
     */
    private MultipleOutputs mos;

    /**
     * Document metadata named output.
     */
    private String namedOutputMeta;

    /**
     * Fault named output.
     */
    private String namedOutputFault;

    /**
     * Set of identifiers of objects excluded from processing.
     */
    private Set<String> excludedIds = Collections.emptySet();

    // ----------------------------- LOGIC ----------------------------------------

    /**
     * Reads named output names, the OAI namespace and the excluded identifiers from the job
     * configuration and instantiates {@link MultipleOutputs}.
     *
     * @param context mapper context providing the job configuration
     * @throws RuntimeException when any of the named outputs is not configured
     */
    @Override
    protected void setup(Mapper<AvroKey<DocumentText>, NullWritable, NullWritable, NullWritable>.Context context)
            throws IOException, InterruptedException {
        namedOutputMeta = context.getConfiguration().get(NAMED_OUTPUT_META);
        if (namedOutputMeta == null || namedOutputMeta.isEmpty()) {
            throw new RuntimeException("no named output provided for metadata");
        }
        namedOutputFault = context.getConfiguration().get(NAMED_OUTPUT_FAULT);
        if (namedOutputFault == null || namedOutputFault.isEmpty()) {
            throw new RuntimeException("no named output provided for fault");
        }
        mos = instantiateMultipleOutputs(context);
        oaiNamespace = Namespace.getNamespace(context.getConfiguration().get(PARAM_INGEST_METADATA_OAI_NAMESPACE,
                "http://www.openarchives.org/OAI/2.0/"));

        String excludedIdsCSV = context.getConfiguration().get(EXCLUDED_IDS);
        String trimmedCSV = excludedIdsCSV != null ? excludedIdsCSV.trim() : "";
        if (!trimmedCSV.isEmpty() && !WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE.equals(trimmedCSV)) {
            log.info("got excluded ids: " + excludedIdsCSV);
            // every token is stripped so that "a, b" excludes "b" rather than " b"
            Set<String> parsedIds = new HashSet<String>(
                    Arrays.asList(StringUtils.stripAll(StringUtils.split(trimmedCSV, ','))));
            // whitespace-only tokens become empty after stripping and cannot match any identifier
            parsedIds.remove("");
            excludedIds = parsedIds;
        } else {
            log.info("got no excluded ids");
        }
    }

    /**
     * Instantiates {@link MultipleOutputs} instance.
     */
    protected MultipleOutputs instantiateMultipleOutputs(Context context) {
        return new MultipleOutputs(context);
    }

    /**
     * Processes single document: skips excluded identifiers and blank texts, otherwise extracts
     * plaintext and metadata from XML and writes the result to the metadata named output.
     *
     * <p>Only extraction and record building failures are turned into faults. Failures raised while
     * writing the output are propagated to the caller so they are neither reported as document
     * faults nor followed by a second write attempt for the same document.</p>
     *
     * @param key wrapper of the document to be processed
     * @param value not used
     * @param context mapper context
     * @throws IOException when writing output fails
     * @throws InterruptedException when writing output is interrupted
     */
    @Override
    protected void map(AvroKey<DocumentText> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        DocumentText nlm = key.datum();
        String documentId = nlm.getId().toString();
        if (excludedIds.contains(documentId)) {
            log.info("skipping processing for excluded id " + documentId);
            return;
        }
        if (StringUtils.isBlank(nlm.getText())) {
            return;
        }

        final ExtractedDocumentMetadata.Builder output = ExtractedDocumentMetadata.newBuilder();
        output.setId(nlm.getId());

        ExtractedDocumentMetadata metadata;
        try {
            String pmcXml = nlm.getText().toString();
            output.setText(extractText(pmcXml, oaiNamespace));
            extractMetadata(pmcXml, output);
            // building inside the guarded section: an incomplete record is handled as a fault
            metadata = output.build();
        } catch (Exception e) {
            handleException(nlm, e, output);
            return;
        }
        mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(metadata));
    }

    /**
     * Extracts plain text from given xml input.
     *
     * <p>The document is parsed without validation, without loading DTDs and with external
     * entities disabled.</p>
     *
     * @param xmlInput XML document content
     * @param oaiNamespace namespace passed to {@link NlmToDocumentTextConverter}
     * @return plaintext extracted from xml input
     * @throws JDOMException when XML could not be parsed
     * @throws IOException when XML could not be read
     */
    protected static CharSequence extractText(String xmlInput, Namespace oaiNamespace)
            throws JDOMException, IOException {
        SAXBuilder builder = new SAXBuilder();
        builder.setValidation(false);
        builder.setFeature(FEATURE_VALIDATION, false);
        builder.setFeature(FEATURE_LOAD_DTD_GRAMMAR, false);
        builder.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false);
        // entities declared in the internal subset must not trigger file or network access
        builder.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
        builder.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);

        Document document = builder.build(new StringReader(xmlInput));
        Element sourceDocument = document.getRootElement();
        return NlmToDocumentTextConverter.getDocumentText(sourceDocument, oaiNamespace);
    }

    /**
     * Extracts metadata from given xml input by supplementing metadata in output
     * builder.
     *
     * <p>The document is parsed without validation, without loading DTDs and with external
     * entities disabled.</p>
     *
     * @param xmlInput XML document content
     * @param output builder supplemented by {@link JatsXmlHandler} while parsing
     * @throws ParserConfigurationException when parser could not be created
     * @throws SAXException when XML could not be parsed or a parser feature is not supported
     * @throws IOException when XML could not be read
     */
    protected static void extractMetadata(String xmlInput, ExtractedDocumentMetadata.Builder output)
            throws ParserConfigurationException, SAXException, IOException {
        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
        saxFactory.setValidating(false);
        SAXParser saxParser = saxFactory.newSAXParser();

        XMLReader reader = saxParser.getXMLReader();
        reader.setFeature(FEATURE_VALIDATION, false);
        reader.setFeature(FEATURE_LOAD_DTD_GRAMMAR, false);
        reader.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false);
        // entities declared in the internal subset must not trigger file or network access
        reader.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
        reader.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);

        JatsXmlHandler pmcXmlHandler = new JatsXmlHandler(output);
        saxParser.parse(new InputSource(new StringReader(xmlInput)), pmcXmlHandler);
    }

    /**
     * Handles exception by writing it as fault.
     *
     * <p>A metadata record is written as well: text and entity type are filled with defaults when
     * they were not set before the failure, all other fields are left as they are in the builder.</p>
     *
     * @param documentText source xml content
     * @param e thrown exception
     * @param builder {@link ExtractedDocumentMetadata} builder
     * @throws IOException
     * @throws InterruptedException
     */
    protected void handleException(DocumentText documentText, Exception e,
            ExtractedDocumentMetadata.Builder builder) throws IOException, InterruptedException {
        // writing empty result, setting required fields first
        if (!builder.hasText()) {
            builder.setText("");
        }
        if (!builder.hasEntityType()) {
            builder.setEntityType(JatsXmlHandler.ENTITY_TYPE_UNKNOWN);
        }
        mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(builder.build()));

        // writing fault result, source text is attached as supplementary data
        Map<CharSequence, CharSequence> auditSupplementaryData = new HashMap<CharSequence, CharSequence>();
        auditSupplementaryData.put(FAULT_TEXT, documentText.getText());
        mos.write(namedOutputFault,
                new AvroKey<Fault>(FaultUtils.exceptionToFault(documentText.getId(), e, auditSupplementaryData)));
    }

    /**
     * Closes multiple outputs.
     *
     * @see org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        log.debug("cleanup: closing multiple outputs...");
        mos.close();
        log.debug("cleanup: multiple outputs closed");
    }
}